import re

import requests

import pandasai.pandas as pd


def get_google_sheet(src) -> list:
    """
    Returns a 2D array of the contents of the Google Sheet at the given URL

    Args:
        src (str): The URL of the Google Sheet.

    Returns:
        list: A 2D array representing content of the Google Sheet.
    """

    try:
        from bs4 import BeautifulSoup
    except ImportError as e:
        raise ImportError(
            "Could not import beautifulsoup4 python package. "
            "Please install it with `pip install beautifulsoup4`."
        ) from e

    # The size of the Google sheet that can be read is limited
    raw_html = requests.get(src).text

    soup = BeautifulSoup(raw_html, "html.parser")
    table = soup.find("tbody")
    rows = table.find_all("tr")
    grid = []
    for row in rows:
        cols = row.find_all("td")
        clean_row = [col.text for col in cols]
        grid.append(clean_row)
    return grid


def sheet_to_df(sheet) -> list:
    """
    Returns a list of dataframes for each data table in a given spreadsheet

    Args:
        sheet (list): A 2D array of the contents of the Google Sheet

    Returns:
        list: A list of dataframes from the Google Sheet.
    """

    # A dataframe starts when a header is found
    # A header is the first instance of a set of contiguous alphanumeric columns
    # A dataframe ends when a blank row is found or an empty column is found

    num = 0  # The number of the dataframe
    headers = []  # Each header is a tuple (num, row, col_start, col_end)
    binding_headers = []
    dfs = []  # Each df is a tuple (num, df)

    # First pass: get all the headers
    for row in range(len(sheet)):
        # if every cell in the row is empty, skip row
        if all(sheet[row][col].strip() == "" for col in range(len(sheet[row]))):
            headers += binding_headers
            binding_headers = []
            continue

        for col in range(len(sheet[row])):
            # Check if the cell is bounded by a header
            if any(col >= header[2] and col <= header[3] for header in binding_headers):
                continue

            # Check if the cell is commented out
            if sheet[row][col].strip().startswith("//"):
                continue

            if re.search("[a-zA-Z]", sheet[row][col]):
                head_start = col
                head_end = col
                while head_end < len(sheet[row]) and re.search(
                    "[a-zA-Z]", sheet[row][head_end]
                ):
                    head_end += 1
                binding_headers.append([num, row, head_start, head_end])
                num += 1
    headers += binding_headers

    # Second pass: get all the dataframes
    for header in headers:
        df = []
        for row in range(header[1], len(sheet)):
            if all(
                sheet[row][col].strip() == "" for col in range(header[2], header[3])
            ):
                break
            df_row = [sheet[row][col] for col in range(header[2], header[3])]
            df.append(df_row)
        cols = df[0]
        data = df[1:]
        df = pd.DataFrame(data, columns=cols)

        # Cast all the numeric columns to numeric types
        for col in df.columns:
            try:
                df[col] = pd.to_numeric(df[col])
            except ValueError:
                pass
        dfs.append(df)

    return dfs


def from_google_sheets(url) -> list:
    """
    Returns the dataframes that are in a Google sheet.

    Args:
        url (str): The URL of the Google Sheet
    Returns:
        list: A list of dataframes from the Google Sheet.
    """

    sheet = get_google_sheet(url)
    return sheet_to_df(sheet)