import os
import re
import time
import urllib.parse

import gspread
import requests
from bs4 import BeautifulSoup
from google.oauth2.service_account import Credentials

# -----------------------------
# Google API / Drive Setup
# -----------------------------
# Path to the service-account key file, resolved relative to the current
# working directory.
SERVICE_ACCOUNT_FILE = 'service_account.json'
# OAuth scopes requested for the credentials: Sheets access plus Drive access.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']

# Build credentials from the key file and hand them to gspread. This runs at
# import time, so importing this module requires the key file to be present.
credentials = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)
gc = gspread.authorize(credentials)

# Open the spreadsheet titled "SVO_Hybrid_Data"; create it if it is not found.
# NOTE(review): a spreadsheet created here presumably belongs to the service
# account and may need to be shared before a human can see it - confirm.
try:
    sh = gc.open("SVO_Hybrid_Data")
except gspread.SpreadsheetNotFound:
    sh = gc.create("SVO_Hybrid_Data")

# Work on the first worksheet. Every run wipes its existing contents and
# rewrites the header row, so results from previous runs are not kept.
worksheet = sh.sheet1
worksheet.clear()
worksheet.append_row([
    "Genus", "Hybrid Name", "Parent1", "Parent2", "Year", "Breeder Notes", "Image Paths"
])

# -----------------------------
# Helper: Download image
# -----------------------------
def download_image(url, save_folder="SVO_Images"):
    """Download one image into ``save_folder`` and return the saved path.

    The local file name is the percent-decoded last segment of the URL's
    path. The query string and fragment are ignored, so ``a.jpg?v=2`` is
    stored as ``a.jpg``.

    Args:
        url: Absolute URL of the image to fetch.
        save_folder: Directory to write into; created if it does not exist.

    Returns:
        The path of the written file, or ``None`` when the URL contains no
        usable file name or the download/write failed. Failures are printed,
        never raised, so one bad image does not abort the scrape.
    """
    try:
        # Only the path component names the file; "?..." and "#..." do not.
        url_path = urllib.parse.urlsplit(url).path
        # Take the basename *after* decoding so an encoded "/" (%2F) or a
        # ".." segment cannot place the file outside save_folder.
        name = os.path.basename(urllib.parse.unquote(url_path))
        if not name or name in (".", ".."):
            print(f"Error downloading {url}: no file name in URL")
            return None

        os.makedirs(save_folder, exist_ok=True)
        filename = os.path.join(save_folder, name)

        # A timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(url, timeout=30)
        # Without this, a 404/500 error page would be saved as the "image".
        resp.raise_for_status()

        with open(filename, "wb") as f:
            f.write(resp.content)
        return filename
    except Exception as e:
        # Deliberately broad: downloads are best-effort for the caller.
        print(f"Error downloading {url}: {e}")
        return None

# -----------------------------
# Helper: Parse hybrid info
# -----------------------------
# A cross is written "Parent A x Parent B": a standalone "x" (or the
# multiplication sign) with whitespace on both sides. Requiring the whitespace
# keeps names that merely contain the letter, such as "maxima", intact.
_CROSS_SEPARATOR = re.compile(r"\s+[x×]\s+")


def parse_hybrid(soup):
    """Extract hybrid records from every table row in a parsed page.

    Each ``<tr>`` of each ``<table>`` is scanned cell by cell using simple
    text heuristics:

    * a cell containing "Hybrid", "Sarc." or "Cross" becomes the ``name``;
    * a cell containing a standalone ``x``/``×`` separator is split once
      into ``parent1`` and ``parent2`` (any further crosses stay in
      ``parent2``);
    * a cell longer than 50 characters is taken as breeder ``notes``.

    When several cells in a row match the same heuristic, the last one wins.

    Args:
        soup: A BeautifulSoup document or tag to search.

    Returns:
        A list of dicts, one per row that produced a name or at least one
        image. Every dict has an ``images`` list of ``src`` values exactly as
        written in the page (possibly relative URLs); ``name``, ``parent1``,
        ``parent2`` and ``notes`` are present only when detected.
    """
    hybrids = []
    # NOTE(review): find_all() is recursive, so rows of nested tables would be
    # visited once per enclosing table - confirm the pages have none.
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            hybrid_data = {"images": []}
            for cell in row.find_all(["td", "th"]):
                # Join text nodes with a space so inline markup such as
                # "A <i>x</i> B" does not collapse into "AxB".
                text = cell.get_text(" ", strip=True)
                if "Hybrid" in text or "Sarc." in text or "Cross" in text:
                    hybrid_data['name'] = text
                parents = _CROSS_SEPARATOR.split(text, maxsplit=1)
                if len(parents) == 2:
                    hybrid_data['parent1'] = parents[0].strip()
                    hybrid_data['parent2'] = parents[1].strip()
                # Simple heuristic: long cells are breeder notes.
                if len(text) > 50:
                    hybrid_data['notes'] = text
            hybrid_data['images'] = [
                img['src'] for img in row.find_all("img") if img.get('src')
            ]
            # Rows with neither a name nor an image carry nothing useful.
            if hybrid_data.get('name') or hybrid_data.get('images'):
                hybrids.append(hybrid_data)
    return hybrids

# -----------------------------
# Main Scraper
# -----------------------------
def scrape_svo_complete(base_url, genera, years, max_pages=5):
    """Scrape hybrid listings and record them in the Google Sheet.

    For every genus/year combination, pages ``1..max_pages`` are requested
    from ``base_url`` with ``genus``, ``year`` and ``page`` query parameters.
    Each page is parsed with ``parse_hybrid``, its images are downloaded with
    ``download_image``, and one row per hybrid is appended to the module-level
    ``worksheet``. Paging for a genus/year stops at the first page that
    yields no hybrids.

    Args:
        base_url: URL of the listing page, without a query string.
        genera: Iterable of genus names to query.
        years: Iterable of years to query.
        max_pages: Highest page number to request per genus/year.

    Returns:
        None. Errors on a page are printed and that page is skipped.
    """
    for genus in genera:
        for year in years:
            for page in range(1, max_pages + 1):
                # urlencode escapes spaces and reserved characters in values.
                query = urllib.parse.urlencode(
                    {"genus": genus, "year": year, "page": page})
                url = f"{base_url}?{query}"
                print(f"Scraping: {url}")
                try:
                    # Timeout so a stalled server cannot hang the whole run.
                    resp = requests.get(url, timeout=30)
                    if resp.status_code != 200:
                        print(f"Failed to fetch {url}")
                        continue
                    soup = BeautifulSoup(resp.text, 'html.parser')
                    hybrids = parse_hybrid(soup)
                    if not hybrids:
                        break  # no more hybrids on this page

                    rows = []
                    for h in hybrids:
                        # <img src> is often relative; resolve it against the
                        # page's final URL before downloading.
                        downloaded_files = [
                            download_image(urllib.parse.urljoin(resp.url, img))
                            for img in h.get('images', [])
                        ]
                        rows.append([
                            genus,
                            h.get('name', ''),
                            h.get('parent1', ''),
                            h.get('parent2', ''),
                            year,
                            h.get('notes', ''),
                            ", ".join(filter(None, downloaded_files)),
                        ])
                    # One batched write per page instead of one API call per
                    # row keeps the run within Sheets write quotas.
                    worksheet.append_rows(rows)
                except Exception as e:
                    # Top-level boundary: report and move on to the next page.
                    print(f"Error scraping page {url}: {e}")
                finally:
                    # Pause after every request (including failures and the
                    # final empty page) to avoid hammering the server.
                    time.sleep(1)

# -----------------------------
# Configuration & Run
# -----------------------------
# Listing page that every request is sent to; genus, year and page are passed
# as query parameters by scrape_svo_complete.
# NOTE(review): this is a static-looking ".html" page for Sarcochilus only -
# confirm the server actually honours the genus/year/page parameters.
BASE_URL = "https://www.sunsetvalleyorchids.com/htm/offerings_sarcochilus.html"
# Genus names to query.
GENERAS = ["Sarcochilus", "Cattleya", "Paphiopedilum", "Dendrobium", "Zygopetalum"]
# Years 2013 through 2019 inclusive (range() excludes its end value).
YEARS = list(range(2013, 2020))  # adjust based on available years

# The scrape starts as soon as this file is executed or imported; there is no
# __main__ guard.
scrape_svo_complete(BASE_URL, GENERAS, YEARS, max_pages=10)

print("Scraping complete. Google Sheet and images ready for AI analysis.")