# ============================================================
# Fully Self-Contained SVO (Subject-Verb-Object) Scraper + Analyzer
# ============================================================

import asyncio
import aiohttp
from bs4 import BeautifulSoup
import re
from collections import Counter
import matplotlib.pyplot as plt
import csv
import os
import logging

# ---------------------------
# CONFIGURATION
# ---------------------------
# Pages to scrape; main() hands this list to fetch_all().
URLS = [
    "https://example.com/page1",
    "https://example.com/page2"
]

# Directory (created on demand by save_results) and the CSV file name
# written inside it.
OUTPUT_DIR = "output"
CSV_FILENAME = "svo_results.csv"

# Number of attempts fetch() makes per URL before giving up.
MAX_RETRIES = 3
# Per-request timeout used by fetch(); aiohttp interprets a numeric
# timeout as a total time in seconds.
TIMEOUT = 10

# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# ---------------------------
# SCRAPER / FETCHER
# ---------------------------
async def fetch(session, url):
    """Download ``url`` and return its body text, retrying on failure.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` once ``MAX_RETRIES`` attempts
        have all failed (also when ``MAX_RETRIES`` is less than 1).
    """
    # aiohttp accepts a bare number for ``timeout`` only for backward
    # compatibility; build the documented ClientTimeout structure instead.
    timeout = aiohttp.ClientTimeout(total=TIMEOUT)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            async with session.get(url, timeout=timeout) as response:
                response.raise_for_status()
                text = await response.text()
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary, one bad URL
            # must not abort the whole gather() in fetch_all().
            # repr() is used because asyncio.TimeoutError has an empty str().
            logging.warning(f"Attempt {attempt} failed for {url}: {e!r}")
            if attempt < MAX_RETRIES:
                # Short exponential backoff (0.5s, 1s, 2s, ...) so a struggling
                # server is not hit again immediately.
                await asyncio.sleep(0.5 * 2 ** (attempt - 1))
        else:
            logging.info(f"Fetched {url}")
            return text

    logging.error(f"Failed to fetch {url} after {MAX_RETRIES} attempts")
    return None

async def fetch_all(urls):
    """Fetch every URL concurrently and return the non-empty page bodies.

    A single ``aiohttp.ClientSession`` is shared by all requests. Results
    that are falsy (``None`` from a failed fetch, or an empty body) are
    dropped, so the returned list may be shorter than ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(
            *(fetch(session, address) for address in urls)
        )
    # filter(None, ...) keeps only truthy entries.
    return list(filter(None, pages))

# ---------------------------
# PARSER
# ---------------------------
# Sentence boundaries: one or more of '.', '!' or '?'.
_SENTENCE_END_RE = re.compile(r"[.!?]+")


def parse_svo(raw_html_list):
    """Extract naive (subject, verb, object) triples from HTML documents.

    Each document's visible text is split into sentences; for every sentence
    with at least three whitespace-separated words, the first word is taken
    as the subject, the second as the verb and the remainder as the object.
    This is a positional heuristic, not a grammatical parse.

    Args:
        raw_html_list: Iterable of HTML strings.

    Returns:
        A list of ``(subject, verb, object)`` string tuples. Documents that
        fail to parse are logged and skipped.
    """
    svo_list = []
    for html in raw_html_list:
        try:
            soup = BeautifulSoup(html, "html.parser")
            # Without a separator, text from adjacent tags is concatenated
            # directly ("...end</p><p>Next..." -> "endNext"), fusing words
            # across element boundaries.
            text = soup.get_text(separator=" ")
        except Exception as e:
            logging.error(f"Error parsing HTML: {e}")
            continue

        # Split on '!' and '?' as well as '.', so questions and exclamations
        # are not merged into the following sentence.
        for sentence in _SENTENCE_END_RE.split(text):
            words = sentence.split()
            if len(words) >= 3:
                svo_list.append((words[0], words[1], " ".join(words[2:])))

    logging.info(f"Parsed {len(svo_list)} SVO tuples")
    return svo_list

# ---------------------------
# UTILS
# ---------------------------
# Anything that is neither a letter/digit nor whitespace. ``\w`` is
# Unicode-aware for str patterns, so accented letters survive; the
# underscore is excluded explicitly because ``\w`` would otherwise keep it.
_NON_ALNUM_RE = re.compile(r"[^\w\s]|_")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Strip punctuation from ``text`` and normalise its whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character that is not a letter, digit or
        whitespace removed, runs of whitespace collapsed to a single space,
        and leading/trailing whitespace trimmed.
    """
    # Remove punctuation first, then collapse whitespace: doing it the other
    # way round leaves double spaces where a free-standing symbol was removed
    # ("a - b" -> "a  b").
    without_punctuation = _NON_ALNUM_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()

# ---------------------------
# PROCESSOR
# ---------------------------
def clean_svo(svo_list):
    """Return the triples of ``svo_list`` cleaned and lower-cased.

    Every subject, verb and object is passed through ``clean_text`` and then
    lower-cased. The result has one tuple per input tuple, in the same order.
    """
    def normalise(part):
        return clean_text(part).lower()

    return [
        (normalise(subject), normalise(action), normalise(target))
        for subject, action, target in svo_list
    ]

# ---------------------------
# ANALYZER
# ---------------------------
def analyze_svo(cleaned_svo):
    """Count how often each subject, verb and object occurs.

    Returns a dict with the keys ``"subject_freq"``, ``"verb_freq"`` and
    ``"object_freq"`` (each a ``Counter``) plus ``"total_svo"``, the number
    of triples in ``cleaned_svo``.
    """
    subject_counts = Counter()
    verb_counts = Counter()
    object_counts = Counter()

    # Single pass: tally each position of the triple into its own counter.
    for subject, action, target in cleaned_svo:
        subject_counts[subject] += 1
        verb_counts[action] += 1
        object_counts[target] += 1

    return {
        "subject_freq": subject_counts,
        "verb_freq": verb_counts,
        "object_freq": object_counts,
        "total_svo": len(cleaned_svo),
    }

# ---------------------------
# VISUALIZER
# ---------------------------
def visualize_svo(analysis):
    """Show a bar chart of the ten most common entries of each frequency table.

    One figure is drawn and shown for each of ``"subject_freq"``,
    ``"verb_freq"`` and ``"object_freq"`` in ``analysis``; a table with no
    entries is skipped.
    """
    for freq_key in ("subject_freq", "verb_freq", "object_freq"):
        ranked = analysis[freq_key].most_common(10)
        if ranked:
            # Split the (label, count) pairs into two parallel tuples.
            names = tuple(name for name, _ in ranked)
            tallies = tuple(tally for _, tally in ranked)

            plt.figure(figsize=(10, 5))
            plt.bar(names, tallies, color="skyblue")
            plt.title(f"Top 10 {freq_key}")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

# ---------------------------
# STORAGE / SAVE RESULTS
# ---------------------------
def save_results(cleaned_svo, analysis):
    """Write the triples to a CSV file and the analysis to a text summary.

    Both files are placed in ``OUTPUT_DIR``, which is created if missing.
    The CSV gets a ``Subject, Verb, Object`` header followed by one row per
    triple; the summary gets one ``key:`` / value section per entry of
    ``analysis``. Any exception is logged and not re-raised.
    """
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # --- CSV with the raw triples ---
        csv_path = os.path.join(OUTPUT_DIR, CSV_FILENAME)
        with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
            rows_out = csv.writer(csv_file)
            rows_out.writerow(["Subject", "Verb", "Object"])
            rows_out.writerows(cleaned_svo)
        logging.info(f"SVO data saved to {csv_path}")

        # --- plain-text summary of the analysis dict ---
        summary_path = os.path.join(OUTPUT_DIR, "analysis_summary.txt")
        with open(summary_path, "w", encoding="utf-8") as summary_file:
            summary_file.writelines(
                f"{name}:\n{value}\n\n" for name, value in analysis.items()
            )
        logging.info(f"Analysis summary saved to {summary_path}")
    except Exception as exc:
        logging.error(f"Error saving results: {exc}")

# ---------------------------
# MAIN WORKFLOW
# ---------------------------
def main():
    """Run the pipeline: fetch, parse, clean, analyze, visualize, save.

    Stops early with a message when no page could be fetched or when no
    triple could be extracted.
    """
    pages = asyncio.run(fetch_all(URLS))
    if not pages:
        print("No data fetched. Exiting.")
        return

    raw_triples = parse_svo(pages)
    if not raw_triples:
        print("No SVO tuples extracted. Exiting.")
        return

    triples = clean_svo(raw_triples)
    summary = analyze_svo(triples)

    visualize_svo(summary)
    save_results(triples, summary)
    print("SVO Scraper + Analyzer finished successfully.")


if __name__ == "__main__":
    main()