import requests
from bs4 import BeautifulSoup
import gspread
from google.oauth2.service_account import Credentials
import os
import urllib.parse

# -----------------------------
# Google API / Drive Setup
# -----------------------------
# Service-account key file; the path is relative, so it is resolved against
# the current working directory.
SERVICE_ACCOUNT_FILE = 'service_account.json'
# OAuth scopes requested for the credentials: Sheets and Drive.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']

# Build the credentials from the key file, limited to SCOPES.
credentials = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# Authorised gspread client used for every spreadsheet call below.
gc = gspread.authorize(credentials)

# Open the "SVO_Hybrid_Data" spreadsheet, creating it when it is not found.
try:
    sh = gc.open("SVO_Hybrid_Data")
except gspread.SpreadsheetNotFound:
    sh = gc.create("SVO_Hybrid_Data")

# NOTE: this runs at module level, so the first worksheet is cleared and the
# header row rewritten every time the file is executed or imported.
worksheet = sh.sheet1
worksheet.clear()
worksheet.append_row([
    "Genus", "Hybrid Name", "Parent1", "Parent2", "Year", "Breeder Notes", "Image Paths"
])

# -----------------------------
# Helper: Download image
# -----------------------------
def download_image(url, save_folder="SVO_Images", timeout=30):
    """Download one image and store it inside ``save_folder``.

    The local file name is the last component of the URL *path*
    (percent-decoded); the query string and fragment are ignored.

    Args:
        url: Absolute URL of the image.
        save_folder: Directory that receives the file; created when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the written file, or ``None`` when the URL carries no
        usable file name or the download / write fails.
    """
    try:
        url_path = urllib.parse.urlparse(url).path
    except ValueError as e:
        print(f"Error downloading {url}: {e}")
        return None

    # basename() is applied *after* decoding so an encoded "../" (%2E%2E%2F)
    # cannot make the file land outside save_folder.
    name = os.path.basename(urllib.parse.unquote(url_path))
    if name in ("", ".", ".."):
        print(f"Error downloading {url}: no file name in URL")
        return None

    os.makedirs(save_folder, exist_ok=True)
    filename = os.path.join(save_folder, name)
    try:
        response = requests.get(url, timeout=timeout)
        # Without this a 404/500 error page would be saved as if it were an image.
        response.raise_for_status()
        with open(filename, "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError, ValueError) as e:
        # ValueError covers names the OS rejects outright (e.g. embedded NUL).
        print(f"Error downloading {url}: {e}")
        return None
    return filename

# -----------------------------
# Helper: Dynamic parsing of hybrid rows
# -----------------------------
def parse_hybrid(soup):
    """Collect hybrid records from the table rows of a parsed page.

    A row is inspected when it has at least two ``td``/``th`` cells. For each
    cell:

    * text containing "Hybrid", "Sarc." or "Cross" becomes the record's
      ``name`` (a later matching cell overrides an earlier one);
    * text of the form ``"A x B"`` fills ``parent1`` / ``parent2``; only the
      first separator splits, so ``"A x (B x C)"`` keeps ``"(B x C)"`` whole.

    The ``src`` of every ``img`` in the row is stored under ``images``.

    Args:
        soup: Parsed document (or any node) offering BeautifulSoup's
            ``find_all`` API.

    Returns:
        A list of dicts, each with an ``images`` list and, when detected,
        ``name``, ``parent1`` and ``parent2``. Rows with no name, no parentage
        and no images are left out, so an empty list means the page has no
        hybrid rows.
    """
    import re  # local import: ``re`` is not among the module-level imports

    # Whitespace is required around the separator so that the letter "x"
    # inside a word (e.g. "Maxwell") is not taken for a cross.
    cross_sep = re.compile(r"\s+[xX×]\s+")
    name_markers = ("Hybrid", "Sarc.", "Cross")

    hybrids = []
    seen_rows = set()
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            # find_all() is recursive: a row of a nested table is reachable
            # through the outer table too. Handle each row only once.
            if id(row) in seen_rows:
                continue
            seen_rows.add(id(row))

            cells = row.find_all(["td", "th"])
            if len(cells) < 2:
                continue

            hybrid_data = {}
            for cell in cells:
                # Join text fragments with a space so inline tags such as
                # "A <i>x</i> B" do not collapse into "AxB".
                text = cell.get_text(" ", strip=True)
                if any(marker in text for marker in name_markers):
                    hybrid_data['name'] = text
                parents = cross_sep.split(text, maxsplit=1)
                if len(parents) == 2:
                    hybrid_data['parent1'] = parents[0].strip()
                    hybrid_data['parent2'] = parents[1].strip()

            images = [img['src'] for img in row.find_all("img") if img.get('src')]
            # Decide *before* storing 'images': that key is always present and
            # would otherwise make every row look like a hit.
            if not hybrid_data and not images:
                continue
            hybrid_data['images'] = images
            hybrids.append(hybrid_data)
    return hybrids

# -----------------------------
# Main Scraper
# -----------------------------
def scrape_svo_dynamic(base_url, genera, years, max_pages=5, timeout=30):
    """Scrape hybrid listings and append them to the module-level ``worksheet``.

    For every (genus, year) pair, pages ``1..max_pages`` of ``base_url`` are
    fetched with ``genus``, ``year`` and ``page`` query parameters. Each page
    is parsed with ``parse_hybrid``, the row images are downloaded with
    ``download_image``, and one sheet row is written per hybrid.

    Paging for a (genus, year) pair stops early when a page yields no hybrids
    or yields exactly the same hybrids as the page before it. A page that
    fails (non-200 status or any exception) is reported and skipped.

    Args:
        base_url: Listing URL without a query string.
        genera: Iterable of genus names.
        years: Iterable of years.
        max_pages: Highest page number requested per (genus, year) pair.
        timeout: Seconds to wait for each page request.

    Returns:
        None. Results are written to the worksheet and to disk.
    """
    for genus in genera:
        for year in years:
            previous_hybrids = None
            for page in range(1, max_pages + 1):
                # urlencode() escapes spaces and other reserved characters.
                query = urllib.parse.urlencode(
                    {"genus": genus, "year": year, "page": page})
                url = f"{base_url}?{query}"
                print(f"Scraping: {url}")
                try:
                    resp = requests.get(url, timeout=timeout)
                    if resp.status_code != 200:
                        print(f"Failed to fetch {url}")
                        continue
                    soup = BeautifulSoup(resp.text, 'html.parser')
                    hybrids = parse_hybrid(soup)
                    if not hybrids:
                        break  # no more hybrids on this page
                    # Defensive: a server that ignores `page` keeps returning
                    # the same listing; stop instead of writing it again.
                    if hybrids == previous_hybrids:
                        break
                    previous_hybrids = hybrids

                    rows = []
                    for h in hybrids:
                        # <img src> is usually relative to the page; resolve
                        # it so the downloader receives an absolute URL.
                        downloaded_files = [
                            download_image(urllib.parse.urljoin(url, img))
                            for img in h.get('images', [])
                        ]
                        rows.append([
                            genus,
                            h.get('name', ''),
                            h.get('parent1', ''),
                            h.get('parent2', ''),
                            year,
                            "",  # placeholder for breeder notes if not detected
                            ", ".join(filter(None, downloaded_files))
                        ])
                    # One API call per page instead of one per row keeps the
                    # run well inside the Sheets write quota.
                    worksheet.append_rows(rows)
                except Exception as e:
                    # Page-level boundary: report and carry on with the next page.
                    print(f"Error scraping page {url}: {e}")

# -----------------------------
# Run the Dynamic Scraper
# -----------------------------
# Listing URL to which the genus/year/page query parameters are appended.
# NOTE(review): the path names the Sarcochilus offerings page, yet it is
# requested for every genus in GENERAS — confirm the endpoint actually honours
# these query parameters.
BASE_URL = "https://www.sunsetvalleyorchids.com/htm/offerings_sarcochilus.html"
# Genera to scrape; each one is combined with every entry of YEARS.
GENERAS = ["Sarcochilus", "Cattleya", "Paphiopedilum", "Dendrobium", "Zygopetalum"]
# Years sent as the `year` query parameter.
YEARS = [2013, 2014, 2015, 2016, 2017, 2018, 2019]

# Runs at module level, i.e. whenever this file is executed or imported.
scrape_svo_dynamic(BASE_URL, GENERAS, YEARS, max_pages=5)

print("Dynamic scraping complete. Sheet and images ready for AI analysis.")