import requests
from bs4 import BeautifulSoup
import gspread
from google.oauth2.service_account import Credentials
import os
import urllib.parse

# -----------------------------
# Google API / Drive Setup
# -----------------------------
SERVICE_ACCOUNT_FILE = 'service_account.json'  # your existing credentials
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
]

# Build service-account credentials with the scopes above and hand them to
# gspread to obtain an authorized client.
credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
gc = gspread.authorize(credentials)


def _open_or_create_spreadsheet(client, title):
    """Open the spreadsheet called *title*; create it if the client cannot find it."""
    try:
        return client.open(title)
    except gspread.SpreadsheetNotFound:
        return client.create(title)


sh = _open_or_create_spreadsheet(gc, "SVO_Hybrid_Data")

# Every run starts from an empty first worksheet that holds only the header row.
worksheet = sh.sheet1
worksheet.clear()
worksheet.append_row(
    ["Genus", "Hybrid Name", "Parent1", "Parent2", "Year", "Breeder Notes", "Image URLs"]
)

# -----------------------------
# Helper: Download image to a local folder
# -----------------------------
def download_image_to_folder(url, save_folder="SVO_Images", timeout=30):
    """Download the image at ``url`` into ``save_folder``.

    The local filename is the percent-decoded last segment of the URL's
    *path*; the query string and fragment are ignored so they do not end up
    in the filename.

    Args:
        url: Absolute URL of the image to fetch.
        save_folder: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` if no filename could be
        derived from the URL or the download/write failed (the error is
        printed in both cases).
    """
    try:
        url_path = urllib.parse.urlsplit(url).path
    except ValueError as e:
        print(f"Error downloading {url}: {e}")
        return None

    # Take the basename *after* decoding so an encoded "%2F" or "..\\" cannot
    # smuggle directory components into the path and escape save_folder.
    decoded_path = urllib.parse.unquote(url_path).replace("\\", "/")
    name = os.path.basename(decoded_path)
    if name in ("", ".", ".."):
        # Reject before touching the network or the filesystem.
        print(f"Error downloading {url}: no filename in URL")
        return None

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_folder, exist_ok=True)
    filename = os.path.join(save_folder, name)
    try:
        # Without a timeout a stalled server would block the scraper forever.
        resp = requests.get(url, timeout=timeout)
        # Do not save an HTML error page under an image filename.
        resp.raise_for_status()
        with open(filename, "wb") as f:
            f.write(resp.content)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        return None
    return filename

# -----------------------------
# Main scraper
# -----------------------------
def _required_text(node, selector):
    """Return the stripped text of the first *selector* match under *node*.

    Raises ValueError naming the selector when nothing matches, so the log
    says which field was missing.
    """
    match = node.select_one(selector)
    if match is None:
        raise ValueError(f"missing element {selector!r}")
    return match.text.strip()


def _build_row(hybrid, page_url, genus, year):
    """Parse one hybrid element, download its images, and return its sheet row."""
    name = _required_text(hybrid, ".hybrid-name")
    parent1 = _required_text(hybrid, ".parent1")
    parent2 = _required_text(hybrid, ".parent2")
    notes = _required_text(hybrid, ".breeder-notes")

    downloaded_files = []
    for img in hybrid.select("img"):
        src = img.get("src")
        if not src:
            continue  # an <img> without a usable src must not drop the whole row
        # Resolve relative src values against the page they came from;
        # the downloader needs an absolute URL.
        local_path = download_image_to_folder(urllib.parse.urljoin(page_url, src))
        if local_path:
            downloaded_files.append(local_path)

    return [genus, name, parent1, parent2, year, notes, ", ".join(downloaded_files)]


def _scrape_page(url, genus, year):
    """Scrape one listing page and append its hybrids to the worksheet.

    Returns False when the page contains no hybrid rows (the caller should
    stop paging) and True otherwise, including when the page returned a
    non-200 status and was skipped.
    """
    resp = requests.get(url, timeout=30)  # never wait forever on a stalled server
    if resp.status_code != 200:
        print(f"Failed to fetch {url}")
        return True
    soup = BeautifulSoup(resp.text, 'html.parser')

    # TODO: Replace selectors based on actual HTML structure
    hybrids = soup.select(".hybrid-row")  # placeholder
    if not hybrids:
        return False  # no more hybrids, stop paging

    rows = []
    for hybrid in hybrids:
        try:
            # resp.url is the final URL after any redirects.
            rows.append(_build_row(hybrid, resp.url, genus, year))
        except Exception as e:  # best-effort: one bad row must not lose the page
            print(f"Error parsing hybrid: {e}")

    if rows:
        try:
            # One batched write per page instead of one API call per row,
            # to stay clear of per-minute write quotas.
            worksheet.append_rows(rows)
        except (gspread.exceptions.APIError, requests.RequestException) as e:
            print(f"Error writing rows to sheet: {e}")
    return True


def scrape_svo(base_url, genera, years, pages_per_genus=5):
    """
    Scrape hybrids from multiple genera, years, and pages.

    For every genus/year pair, pages 1..pages_per_genus are requested in
    order; paging for that pair stops early at the first page with no hybrid
    rows. Each hybrid becomes one row in the module-level ``worksheet`` and
    its images are saved via ``download_image_to_folder``. Errors on a page
    are printed and the next page is tried.

    base_url: SVO page URL for genus offerings
    genera: list of genera (e.g., ["Sarcochilus", "Cattleya"])
    years: list of years
    pages_per_genus: max pages to iterate per genus
    """
    for genus in genera:
        for year in years:
            for page in range(1, pages_per_genus + 1):
                # urlencode escapes spaces, '&', etc. in the values instead of
                # splicing them raw into the query string.
                query = urllib.parse.urlencode(
                    {"genus": genus, "year": year, "page": page}
                )
                url = f"{base_url}?{query}"
                print(f"Scraping: {url}")
                try:
                    has_more = _scrape_page(url, genus, year)
                except Exception as e:  # best-effort: a bad page must not abort the run
                    print(f"Error fetching page: {e}")
                    continue
                if not has_more:
                    break

# -----------------------------
# Run the scraper
# -----------------------------
# NOTE(review): this is the Sarcochilus offerings page, yet it is combined with
# every genus below through the query string — confirm the site serves other
# genera from this URL.
BASE_URL = "https://www.sunsetvalleyorchids.com/htm/offerings_sarcochilus.html"  # starting URL
GENERAS = [
    "Sarcochilus",
    "Cattleya",
    "Paphiopedilum",
    "Dendrobium",
    "Zygopetalum",
]
YEARS = list(range(2013, 2020))  # 2013 through 2019 inclusive

# Scrape every genus/year combination, at most five pages each.
scrape_svo(base_url=BASE_URL, genera=GENERAS, years=YEARS, pages_per_genus=5)

print("Scraping complete. Google Sheet and image folder ready for AI analysis.")