# ── stdlib ────────────────────────────────────────────────────────────────────
import json
import logging
import os
import re
import sys
import time
import urllib.robotparser
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin, urldefrag, urlparse

# ── third-party ───────────────────────────────────────────────────────────────
try:
    import requests
    from bs4 import BeautifulSoup
    import trafilatura
    from dotenv import load_dotenv
except ImportError as e:
    sys.exit(
        f"[FATAL] Missing dependency: {e}\n"
        "Run: pip install -r requirements.txt"
    )

load_dotenv()

# ══════════════════════════════════════════════════════════════════════════════
# §1  CONFIGURATION
# ══════════════════════════════════════════════════════════════════════════════

BASE_URL      = os.getenv("BASE_URL",          "https://www.moweb.com/")
OUTPUT_DIR    = Path(os.getenv("OUTPUT_DIR",   "scraped_output"))
REQUEST_DELAY = float(os.getenv("REQUEST_DELAY", "1.5"))
TIMEOUT       = int(os.getenv("TIMEOUT",       "30"))
JS_THRESHOLD  = int(os.getenv("JS_WORD_THRESHOLD", "50"))  # words → assume JS page
USER_AGENT    = os.getenv("USER_AGENT",        "MowebCrawler/1.0 (internal)")
RESUME        = os.getenv("RESUME",            "true").lower() == "true"

_md = os.getenv("MAX_DEPTH", "0")
MAX_DEPTH: Optional[int] = int(_md) if _md.isdigit() and int(_md) > 0 else None  # unset/0/invalid → unlimited

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

STATE_FILE = OUTPUT_DIR / ".crawl_state.json"   # hidden — internal resume file
INDEX_FILE = OUTPUT_DIR / "index.json"           # public — URL → file mapping
LOG_FILE   = OUTPUT_DIR / "crawl_log.txt"
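
# Illustrative .env overrides (all optional; the defaults above apply when unset):
#
#   BASE_URL=https://www.moweb.com/
#   OUTPUT_DIR=scraped_output
#   REQUEST_DELAY=1.5
#   TIMEOUT=30
#   JS_WORD_THRESHOLD=50
#   MAX_DEPTH=0                        # 0 or unset → unlimited depth
#   USER_AGENT=MowebCrawler/1.0 (internal)
#   RESUME=true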

# File extensions that are never pages
SKIP_EXTS = {
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".avif",
    ".mp4", ".mp3", ".avi", ".mov", ".webm",
    ".zip", ".rar", ".tar", ".gz", ".7z", ".exe", ".dmg",
    ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    ".css", ".js", ".json", ".xml", ".ico",
    ".woff", ".woff2", ".ttf", ".eot", ".otf",
}
SKIP_SCHEMES = {"mailto:", "tel:", "javascript:", "data:", "ftp:", "sms:"}

# ══════════════════════════════════════════════════════════════════════════════
# §2  LOGGING
# ══════════════════════════════════════════════════════════════════════════════

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)-8s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
)
log = logging.getLogger("crawler")

# ══════════════════════════════════════════════════════════════════════════════
# §3  URL UTILITIES
# ══════════════════════════════════════════════════════════════════════════════

_parsed_base = urlparse(BASE_URL)
BASE_DOMAIN  = _parsed_base.netloc   # e.g. "www.moweb.com"


def normalize_url(raw: str, source_url: str = BASE_URL) -> Optional[str]:
    """
    Resolve relative → absolute, strip fragments, validate internal.
    Returns a normalized URL string or None if the URL should be skipped.
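    Examples (assuming the default BASE_URL of "https://www.moweb.com/"):
        "/about/#team"              → https://www.moweb.com/about/
        "mailto:hello@example.com"  → None  (skipped scheme)
        "https://example.org/page"  → None  (external domain)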
    """
    raw = raw.strip()
    if not raw:
        return None
    if any(raw.startswith(s) for s in SKIP_SCHEMES):
        return None

    raw, _ = urldefrag(raw)               # strip #anchors
    if not raw:                           # bare anchor (#section) → nothing left
        return None
    url = urljoin(source_url, raw)        # resolve relative paths
    parsed = urlparse(url)

    if parsed.netloc != BASE_DOMAIN:      # external domain — skip
        return None
    if parsed.scheme not in ("http", "https"):
        return None
    if Path(parsed.path).suffix.lower() in SKIP_EXTS:
        return None

    # Normalize: lowercase scheme+host, collapse duplicate slashes
    path = re.sub(r"/+", "/", parsed.path) or "/"
    return parsed._replace(
        scheme=parsed.scheme.lower(),
        netloc=parsed.netloc.lower(),
        path=path,
        fragment="",
    ).geturl()


def url_to_slug(url: str) -> str:
    """
    /about-us/       → about-us
    /services/web    → services__web
    / (homepage)     → home
    """
    path = urlparse(url).path.strip("/")
    if not path:
        return "home"
    slug = path.replace("/", "__")
    slug = re.sub(r"[^\w\-]", "-", slug)
    slug = re.sub(r"-{2,}", "-", slug).strip("-")
    return slug or "home"


# ══════════════════════════════════════════════════════════════════════════════
# §4  ROBOTS.TXT  (warn only — caller owns the site)
# ══════════════════════════════════════════════════════════════════════════════

def load_robots(base_url: str) -> urllib.robotparser.RobotFileParser:
    rp = urllib.robotparser.RobotFileParser()
    robots_url = urljoin(base_url, "/robots.txt")
    try:
        rp.set_url(robots_url)
        rp.read()
        log.info("robots.txt loaded: %s", robots_url)
    except Exception as e:
        log.warning("Could not fetch robots.txt: %s", e)
    return rp


def robots_check(rp: urllib.robotparser.RobotFileParser, url: str) -> bool:
    """Returns True if allowed. Logs a warning when disallowed but never blocks."""
    if not rp.can_fetch(USER_AGENT, url):
        log.warning("[ROBOTS] Disallowed (crawling anyway): %s", url)
        return False
    return True


# ══════════════════════════════════════════════════════════════════════════════
# §5  HTTP SESSION
# ══════════════════════════════════════════════════════════════════════════════

def build_session() -> requests.Session:
    s = requests.Session()
    s.headers.update({
        "User-Agent":      USER_AGENT,
        "Accept":          "text/html,application/xhtml+xml,*/*;q=0.9",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    })
    # Retry once on connection errors
    adapter = requests.adapters.HTTPAdapter(max_retries=1)
    s.mount("http://",  adapter)
    s.mount("https://", adapter)
    return s
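
# Note: an integer max_retries only retries low-level connection failures; it does
# not retry responses such as HTTP 500/503. If status-based retries are wanted,
# a urllib3 Retry policy could be mounted instead (optional sketch):
#
#     from urllib3.util.retry import Retry
#     retry   = Retry(total=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504))
#     adapter = requests.adapters.HTTPAdapter(max_retries=retry)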


# ══════════════════════════════════════════════════════════════════════════════
# §6  STATIC FETCH (requests)
# ══════════════════════════════════════════════════════════════════════════════

def fetch_static(
    session: requests.Session, url: str
) -> tuple[Optional[str], Optional[str]]:
    """
    Fetch via requests. Returns (html, final_url) or (None, final_url) on error.
    final_url reflects any redirects.
    """
    try:
        resp = session.get(url, timeout=TIMEOUT, allow_redirects=True)
        final_url = resp.url

        if resp.status_code != 200:
            log.warning("[HTTP %d] %s", resp.status_code, url)
            return None, final_url

        ct = resp.headers.get("content-type", "")
        if "text/html" not in ct and "application/xhtml" not in ct:
            log.debug("[SKIP non-HTML] %s (%s)", url, ct)
            return None, final_url

        return resp.text, final_url

    except requests.exceptions.Timeout:
        log.error("[TIMEOUT] %s", url)
    except requests.exceptions.TooManyRedirects:
        log.error("[REDIRECT LOOP] %s", url)
    except requests.exceptions.ConnectionError as e:
        log.error("[CONN ERROR] %s — %s", url, e)
    except requests.exceptions.RequestException as e:
        log.error("[REQUEST ERROR] %s — %s", url, e)

    return None, None


# ══════════════════════════════════════════════════════════════════════════════
# §7  JS-RENDERING DETECTION
# ══════════════════════════════════════════════════════════════════════════════

def count_visible_words(html: str) -> int:
    """Count words in visible text after stripping all tags and non-content nodes."""
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript", "head", "meta", "link"]):
        tag.decompose()
    return len(soup.get_text(separator=" ").split())


def needs_js_render(html: str) -> bool:
    words = count_visible_words(html)
    log.debug("Visible word count (static): %d  [threshold: %d]", words, JS_THRESHOLD)
    return words < JS_THRESHOLD


# ══════════════════════════════════════════════════════════════════════════════
# §8  PLAYWRIGHT FETCH  (lazy init, browser reused across pages)
# ══════════════════════════════════════════════════════════════════════════════

_pw_context  = None   # Playwright driver object from sync_playwright().start()
_pw_browser  = None   # Chromium browser instance
_pw_disabled = False  # set True if Playwright is not installed


def _init_playwright():
    global _pw_context, _pw_browser, _pw_disabled
    if _pw_disabled or _pw_browser is not None:
        return
    try:
        from playwright.sync_api import sync_playwright
        _pw_context = sync_playwright().start()
        _pw_browser = _pw_context.chromium.launch(headless=True)
        log.info("[PLAYWRIGHT] Browser initialized (Chromium, headless)")
    except ImportError:
        log.warning(
            "[PLAYWRIGHT] Not installed — JS fallback disabled.\n"
            "  Fix: pip install playwright && playwright install chromium"
        )
        _pw_disabled = True
    except Exception as e:
        log.error("[PLAYWRIGHT] Init failed: %s", e)
        _pw_disabled = True


def close_playwright():
    global _pw_context, _pw_browser
    if _pw_browser:
        try:
            _pw_browser.close()
            _pw_context.stop()
        except Exception:
            pass
        _pw_browser = None
        _pw_context = None
        log.info("[PLAYWRIGHT] Browser closed.")


def fetch_with_playwright(url: str) -> Optional[str]:
    global _pw_disabled
    if _pw_disabled:
        return None
    _init_playwright()
    if _pw_disabled or _pw_browser is None:
        return None

    page = None
    try:
        page = _pw_browser.new_page()
        page.set_extra_http_headers({"User-Agent": USER_AGENT})
        page.goto(url, wait_until="networkidle", timeout=TIMEOUT * 1000)
        return page.content()
    except Exception as e:
        # Covers Playwright's TimeoutError as well as any other Playwright failure
        log.error("[PLAYWRIGHT] Failed on %s — %s", url, e)
        return None
    finally:
        # Always release the page, even if goto() or content() raised
        if page is not None:
            try:
                page.close()
            except Exception:
                pass


# ══════════════════════════════════════════════════════════════════════════════
# §9  CONTENT EXTRACTION
# ══════════════════════════════════════════════════════════════════════════════

def extract_title(soup: BeautifulSoup) -> str:
    og_title = soup.find("meta", property="og:title")
    if og_title and og_title.get("content"):
        return og_title["content"].strip()
    tag = soup.find("title")
    return tag.get_text(strip=True) if tag else "Untitled"


def extract_via_trafilatura(html: str, url: str) -> Optional[str]:
    """
    trafilatura is purpose-built for extracting article/page body
    while discarding nav, headers, footers, and ads.
    """
    return trafilatura.extract(
        html,
        url=url,
        include_comments=False,
        include_tables=True,
        no_fallback=False,
        favor_recall=True,       # permissive → less likely to miss real content
        output_format="txt",
    )


_HEADING_MAP = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}

def extract_via_bs4(soup: BeautifulSoup) -> str:
    """
    Heuristic fallback extraction.
    Removes structural chrome (nav, header, footer, sidebar, forms)
    then serializes semantic content tags preserving hierarchy.
    """
    # --- strip non-content structural elements ---
    remove_tags = [
        "script", "style", "noscript", "head", "meta", "link",
        "nav", "header", "footer", "aside",
        "form", "button", "input", "select", "textarea",
        "iframe", "embed", "object",
    ]
    # Also strip by common structural class/id patterns
    noise_patterns = re.compile(
        r"(nav|navbar|navigation|menu|header|footer|sidebar|breadcrumb"
        r"|cookie|banner|popup|modal|overlay|social|share|ad-|ads-"
        r"|widget|skip-link|back-to-top)",
        re.IGNORECASE,
    )
    for tag in soup.find_all(True):
        # Skip elements already destroyed along with a decomposed ancestor
        if getattr(tag, "decomposed", False):
            continue
        classes = " ".join(tag.get("class") or [])
        id_attr = tag.get("id") or ""
        if tag.name in remove_tags or noise_patterns.search(classes) or noise_patterns.search(id_attr):
            tag.decompose()

    lines = []
    seen  = set()

    content_tags = ["h1","h2","h3","h4","h5","h6","p","li","td","th","blockquote","dt","dd"]
    for elem in soup.find_all(content_tags):
        text = elem.get_text(separator=" ", strip=True)
        text = re.sub(r"\s{2,}", " ", text)
        if not text or len(text) < 4 or text in seen:
            continue
        seen.add(text)

        if elem.name in _HEADING_MAP:
            prefix = _HEADING_MAP[elem.name]
            lines.append(f"\n{prefix} {text}\n")
        elif elem.name == "li":
            lines.append(f"  • {text}")
        elif elem.name == "blockquote":
            lines.append(f"\n> {text}\n")
        else:
            lines.append(text)

    return "\n".join(lines)


def extract_page(html: str, url: str) -> tuple[str, str]:
    """
    Returns (title, clean_body_text).
    Tries trafilatura → falls back to BS4 heuristic if result is too sparse.
    """
    soup  = BeautifulSoup(html, "lxml")
    title = extract_title(soup)

    body = extract_via_trafilatura(html, url) or ""
    if len(body.split()) < 25:
        log.debug("[EXTRACT] trafilatura sparse → using BS4 fallback for %s", url)
        body = extract_via_bs4(soup)

    return title, body.strip()


# ══════════════════════════════════════════════════════════════════════════════
# §10  LINK EXTRACTION
# ══════════════════════════════════════════════════════════════════════════════

def extract_links(html: str, source_url: str) -> list[str]:
    """Return all valid, normalized internal links found in the page."""
    soup  = BeautifulSoup(html, "lxml")
    links = []
    for a in soup.find_all("a", href=True):
        norm = normalize_url(a["href"], source_url)
        if norm:
            links.append(norm)
    return links


# ══════════════════════════════════════════════════════════════════════════════
# §11  OUTPUT WRITER
# ══════════════════════════════════════════════════════════════════════════════

def write_page(url: str, title: str, body: str) -> str:
    """
    Write page content to a .txt file with metadata header.
    Returns the filename used.
    Handles slug collisions between different URLs with a numeric suffix.
    """
    slug     = url_to_slug(url)
    filename = f"{slug}.txt"
    out_path = OUTPUT_DIR / filename

    # Collision guard: two different URL paths → same slug
    counter = 1
    while out_path.exists():
        filename = f"{slug}_{counter}.txt"
        out_path  = OUTPUT_DIR / filename
        counter  += 1

    divider   = "─" * 30
    header    = (
        f"TITLE : {title}\n"
        f"URL   : {url}\n"
        f"{divider}\n\n"
    )

    out_path.write_text(header + body, encoding="utf-8")
    return filename
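
# Resulting file layout (illustrative):
#
#   TITLE : About Us
#   URL   : https://www.moweb.com/about-us/
#   ──────────────────────────────
#
#   <extracted body text>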


# ══════════════════════════════════════════════════════════════════════════════
# §12  STATE MANAGEMENT (resume support)
# ══════════════════════════════════════════════════════════════════════════════

def load_state() -> tuple[set, deque, dict]:
    """
    Load previously saved crawl state.
    Returns (visited_set, url_queue, index_dict).
    """
    if RESUME and STATE_FILE.exists():
        try:
            raw     = json.loads(STATE_FILE.read_text(encoding="utf-8"))
            visited = set(raw.get("visited", []))
            queue   = deque(tuple(item) for item in raw.get("queue", []))
            index   = raw.get("index", {})
            log.info("Resumed: %d visited URLs, %d in queue", len(visited), len(queue))
            return visited, queue, index
        except Exception as e:
            log.warning("Could not load crawl state (%s) — starting fresh", e)

    return set(), deque(), {}


def save_state(visited: set, queue: deque, index: dict):
    """Atomically persist crawl state (write → rename)."""
    tmp = STATE_FILE.with_suffix(".tmp")
    tmp.write_text(
        json.dumps(
            {"visited": list(visited), "queue": list(queue), "index": index},
            indent=2,
        ),
        encoding="utf-8",
    )
    tmp.replace(STATE_FILE)
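
# Illustrative .crawl_state.json layout (keys mirror save_state above):
#
#   {
#     "visited": ["https://www.moweb.com/", "https://www.moweb.com/about-us/"],
#     "queue":   [["https://www.moweb.com/services/", 1]],
#     "index":   {"https://www.moweb.com/": {"status": "ok", "file": "home.txt", ...}}
#   }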


# ══════════════════════════════════════════════════════════════════════════════
# §13  MAIN CRAWLER LOOP
# ══════════════════════════════════════════════════════════════════════════════

def crawl():
    log.info("═" * 60)
    log.info("  Moweb Crawler — Starting")
    log.info("  Base URL  : %s", BASE_URL)
    log.info("  Output    : %s", OUTPUT_DIR.resolve())
    log.info("  Max depth : %s", MAX_DEPTH if MAX_DEPTH else "unlimited")
    log.info("  Delay     : %.1fs | Timeout: %ds", REQUEST_DELAY, TIMEOUT)
    log.info("  Resume    : %s", RESUME)
    log.info("═" * 60)

    session         = build_session()
    robots          = load_robots(BASE_URL)
    visited, queue, index = load_state()

    # Seed queue on fresh start
    if not queue:
        seed = normalize_url(BASE_URL) or BASE_URL
        queue.append((seed, 0))   # (url, depth)

    pages_ok  = 0
    pages_err = 0
    pages_skip = 0

    try:
        while queue:
            url, depth = queue.popleft()

            # ── Deduplication ─────────────────────────────────────────────
            if url in visited:
                continue

            # ── Depth guard ───────────────────────────────────────────────
            if MAX_DEPTH is not None and depth > MAX_DEPTH:
                log.debug("[DEPTH LIMIT] Skipping (depth=%d): %s", depth, url)
                pages_skip += 1
                continue

            visited.add(url)
            log.info(
                "[Q:%d D:%d Done:%d] %s",
                len(queue), depth, pages_ok, url,
            )

            # ── robots.txt advisory check ─────────────────────────────────
            robots_check(robots, url)

            # ── Static fetch ──────────────────────────────────────────────
            html, final_url = fetch_static(session, url)

            if html is None:
                pages_err += 1
                index[url] = {"status": "error", "file": None, "depth": depth}
                save_state(visited, queue, index)
                time.sleep(REQUEST_DELAY)
                continue

            # Redirect: add final URL to visited so we don't re-crawl it
            if final_url and final_url != url:
                norm_final = normalize_url(final_url)
                if norm_final and norm_final not in visited:
                    log.info("  ↳ Redirect: %s → %s", url, norm_final)
                    visited.add(norm_final)
                    index[url] = {"status": "redirect", "to": norm_final, "file": None, "depth": depth}
                    url = norm_final  # use final URL for extraction + indexing

            # ── JS detection → Playwright fallback ────────────────────────
            used_playwright = False
            if needs_js_render(html):
                log.info("  ↳ [JS DETECT] Low word count — trying Playwright")
                pw_html = fetch_with_playwright(url)
                if pw_html:
                    html            = pw_html
                    used_playwright = True
                    log.info("  ↳ [PLAYWRIGHT] Rendered successfully")
                else:
                    log.warning("  ↳ [PLAYWRIGHT] Failed — using sparse static HTML")

            # ── Content extraction ────────────────────────────────────────
            title, body = extract_page(html, url)

            if not body.strip():
                log.warning("  ↳ [EMPTY] No content extracted — skipping file write")
                pages_skip += 1
                index[url] = {"status": "empty", "file": None, "depth": depth}
            else:
                filename = write_page(url, title, body)
                word_count = len(body.split())
                log.info("  ↳ Saved: %-40s (%d words)", filename, word_count)
                pages_ok += 1
                index[url] = {
                    "status":     "ok",
                    "file":       filename,
                    "title":      title,
                    "depth":      depth,
                    "words":      word_count,
                    "playwright": used_playwright,
                }

            # ── Link discovery ────────────────────────────────────────────
            links     = extract_links(html, url)
            new_count = 0
            for link in links:
                if link not in visited:
                    queue.append((link, depth + 1))
                    new_count += 1
            log.debug("  ↳ Found %d new links", new_count)

            # ── Persist state + polite delay ──────────────────────────────
            save_state(visited, queue, index)
            time.sleep(REQUEST_DELAY)

    except KeyboardInterrupt:
        log.info("\n[INTERRUPTED] Crawl state saved. Re-run to resume.")

    finally:
        close_playwright()
        # Write final index.json
        INDEX_FILE.write_text(
            json.dumps(index, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        # Summary
        log.info("═" * 60)
        log.info("  Crawl finished")
        log.info("  Pages saved : %d", pages_ok)
        log.info("  Errors      : %d", pages_err)
        log.info("  Skipped     : %d", pages_skip)
        log.info("  Index       : %s", INDEX_FILE)
        log.info("  Log         : %s", LOG_FILE)
        log.info("═" * 60)


# ══════════════════════════════════════════════════════════════════════════════
# §14  ENTRY POINT
# ══════════════════════════════════════════════════════════════════════════════
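
# Typical invocation (assuming the script is saved as crawler.py; adjust as needed):
#
#   $ python crawler.py                                # fresh crawl, or resume if state exists
#   $ MAX_DEPTH=2 REQUEST_DELAY=0.5 python crawler.py  # shallow, faster run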

if __name__ == "__main__":
    crawl()
