import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import logging
import re

# Module-level logger, named after this module; SEOAuditor uses it to report
# page audits and sitemap fetches that fail.
logger = logging.getLogger(__name__)


class SEOAuditor:
    """Audit a website or individual pages for common SEO issues.

    The auditor fetches pages over HTTP, extracts a handful of on-page
    metrics (title, meta description, headings, images, links, word count)
    and turns them into a 0-100 score plus a list of human-readable issues.

    Attributes:
        site_url: Root URL of the site, without a trailing slash.
        domain: Network location (host[:port]) of ``site_url``; links whose
            host matches it case-insensitively are counted as internal.
    """

    # Identifies this tool in the User-Agent header of every request it makes.
    USER_AGENT = "Core365-SEO-Auditor/1.0"
    # Default cap on pages audited by ``audit_site`` (keeps a run short).
    DEFAULT_MAX_PAGES = 20
    # Upper bound on child sitemaps fetched when expanding a sitemap index.
    MAX_CHILD_SITEMAPS = 10
    # ``<meta name=...>`` values are matched case-insensitively
    # ("Description" and "description" are treated alike).
    _META_DESCRIPTION_NAME = re.compile(r"^description$", re.IGNORECASE)

    def __init__(self, site_url: str):
        """Store the normalized site root and derive its domain.

        Args:
            site_url: Root URL of the site. Surrounding whitespace and
                trailing slashes are removed. A non-empty value without a
                scheme (e.g. ``"example.com"``) is assumed to be HTTPS,
                because without a scheme ``urlparse`` yields an empty
                ``netloc`` and no URL built from it can be requested.
        """
        site_url = site_url.strip()
        if site_url and "://" not in site_url:
            site_url = f"https://{site_url.lstrip('/')}"
        self.site_url = site_url.rstrip("/")
        self.domain = urlparse(self.site_url).netloc

    # ------------------------------------------------------------------
    def audit_page(self, url: str) -> dict:
        """Run a full SEO audit on a single page.

        Args:
            url: Absolute URL of the page to fetch and analyse.

        Returns:
            On success, a dict with the keys ``url``, ``title``,
            ``title_length``, ``meta_description``,
            ``meta_description_length``, ``headings``, ``images_total``,
            ``images_without_alt``, ``internal_links``, ``external_links``,
            ``word_count``, ``score`` and ``issues``.
            On any failure (network error, HTTP error status, parse error),
            a dict with ``url``, ``error``, ``score`` (0) and ``issues``
            (a one-element list holding the error text). This method never
            raises; failures are logged instead.
        """
        try:
            resp = self._get(url, timeout=15)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "lxml")
            # Relative links must be resolved against the URL that actually
            # served the document, which differs from ``url`` after a redirect.
            metrics = self._extract_metrics(soup, resp.url or url)
            score, issues = self._score_metrics(metrics)
            return {"url": url, **metrics, "score": score, "issues": issues}
        except Exception as e:
            # Deliberate catch-all: one broken page must not abort a site audit.
            logger.error("Failed to audit %s: %s", url, e)
            return {"url": url, "error": str(e), "score": 0, "issues": [str(e)]}

    # ------------------------------------------------------------------
    def audit_site(self, sitemap_url=None, max_pages=DEFAULT_MAX_PAGES) -> list:
        """Audit the pages listed in the sitemap.

        Args:
            sitemap_url: Sitemap to read; defaults to
                ``<site_url>/sitemap.xml``.
            max_pages: Maximum number of pages to audit (default 20, to keep
                the run short). ``None`` audits every URL found. Negative
                values are treated as 0.

        Returns:
            A list of ``audit_page`` result dicts, in sitemap order. When the
            sitemap yields no URLs, only the homepage is audited.
        """
        urls = self._get_urls_from_sitemap(sitemap_url)
        if not urls:
            # Fall back to just the homepage
            urls = [self.site_url]
        if max_pages is not None:
            urls = urls[:max(0, max_pages)]
        return [self.audit_page(url) for url in urls]

    # ------------------------------------------------------------------
    def check_robots_txt(self) -> dict:
        """Check whether ``<site_url>/robots.txt`` is served.

        Returns:
            ``{"exists": bool, "content": str}`` where ``content`` holds at
            most the first 1000 characters of the file (empty when it is not
            served with status 200). If the request itself fails, an
            ``"error"`` key with the exception text is added.
        """
        try:
            resp = self._get(f"{self.site_url}/robots.txt", timeout=10)
            if resp.status_code == 200:
                return {"exists": True, "content": resp.text[:1000]}
            return {"exists": False, "content": ""}
        except Exception as e:
            # Best-effort probe: report the failure instead of raising.
            return {"exists": False, "content": "", "error": str(e)}

    # ------------------------------------------------------------------
    def check_sitemap(self) -> dict:
        """Check whether ``<site_url>/sitemap.xml`` is served.

        Returns:
            ``{"exists": bool, "url_count": int}`` where ``url_count`` is the
            number of non-empty ``<loc>`` entries in that file (for a sitemap
            index these are the child sitemaps; they are not followed here).
            If the request or parsing fails, an ``"error"`` key with the
            exception text is added.
        """
        try:
            resp = self._get(f"{self.site_url}/sitemap.xml", timeout=10)
            if resp.status_code != 200:
                return {"exists": False, "url_count": 0}
            _, locs = self._parse_sitemap(resp.content)
            return {"exists": True, "url_count": len(locs)}
        except Exception as e:
            # Best-effort probe: report the failure instead of raising.
            return {"exists": False, "url_count": 0, "error": str(e)}

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _get(self, url, timeout):
        """Issue a GET request carrying the auditor's User-Agent header."""
        return requests.get(
            url, timeout=timeout, headers={"User-Agent": self.USER_AGENT}
        )

    def _get_urls_from_sitemap(self, sitemap_url=None) -> list:
        """Return the page URLs listed in a sitemap.

        Args:
            sitemap_url: Sitemap to read; defaults to
                ``<site_url>/sitemap.xml``.

        Returns:
            De-duplicated page URLs in document order, or ``[]`` when the
            sitemap cannot be fetched. If the document is a sitemap index,
            up to ``MAX_CHILD_SITEMAPS`` child sitemaps are fetched and their
            URLs returned instead, so that sitemap XML files are not audited
            as if they were pages. Child sitemaps that fail to load are
            logged and skipped.
        """
        if not sitemap_url:
            sitemap_url = f"{self.site_url}/sitemap.xml"
        try:
            is_index, locs = self._read_sitemap(sitemap_url)
        except Exception as e:
            logger.error("Failed to parse sitemap: %s", e)
            return []

        if is_index:
            page_urls = []
            for child_url in locs[:self.MAX_CHILD_SITEMAPS]:
                try:
                    child_is_index, child_locs = self._read_sitemap(child_url)
                except Exception as e:
                    logger.error("Failed to parse sitemap %s: %s", child_url, e)
                    continue
                # Only one level is expanded; a nested index is ignored.
                if not child_is_index:
                    page_urls.extend(child_locs)
            locs = page_urls

        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(locs))

    def _read_sitemap(self, sitemap_url, timeout=15):
        """Fetch one sitemap document and parse it (raises on failure)."""
        resp = self._get(sitemap_url, timeout=timeout)
        resp.raise_for_status()
        return self._parse_sitemap(resp.content)

    @staticmethod
    def _parse_sitemap(content):
        """Parse sitemap XML into ``(is_index, locs)``.

        ``is_index`` is True when the document contains a ``<sitemapindex>``
        element. ``locs`` holds the whitespace-stripped text of every
        ``<loc>`` element, with empty entries dropped (pretty-printed
        sitemaps often wrap the URL in newlines and indentation).
        """
        soup = BeautifulSoup(content, "lxml-xml")
        stripped = (loc.get_text(strip=True) for loc in soup.find_all("loc"))
        locs = [text for text in stripped if text]
        return soup.find("sitemapindex") is not None, locs

    def _classify_link(self, base_url, href):
        """Classify an ``href`` found on the page served from ``base_url``.

        Returns:
            ``"internal"`` when the resolved link is http(s) and its host
            matches ``self.domain`` (case-insensitively), ``"external"`` for
            any other http(s) link, and ``None`` for links that are not
            counted: empty hrefs, in-page ``#fragment`` anchors, non-http(s)
            schemes (``mailto:``, ``tel:``, ``javascript:`` ...) and hrefs
            that cannot be parsed as a URL.
        """
        href = (href or "").strip()
        if not href or href.startswith("#"):
            return None
        try:
            parsed = urlparse(urljoin(base_url, href))
        except ValueError:
            # e.g. an unbalanced "[" in the host part; skip the one bad link
            # rather than failing the whole page audit.
            return None
        if parsed.scheme not in ("http", "https"):
            return None
        if parsed.netloc.lower() == self.domain.lower():
            return "internal"
        return "external"

    def _extract_metrics(self, soup, base_url) -> dict:
        """Collect the raw on-page metrics from a parsed HTML document.

        Args:
            soup: Parsed page (a BeautifulSoup document).
            base_url: URL used to resolve relative links.

        Returns:
            A dict with ``title``, ``title_length``, ``meta_description``,
            ``meta_description_length``, ``headings`` (``{"h1": [...], ...}``,
            only for levels present), ``images_total``,
            ``images_without_alt``, ``internal_links``, ``external_links``
            and ``word_count``.
        """
        # Title
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else ""

        # Meta description (whitespace-only content counts as missing)
        meta_tag = soup.find("meta", attrs={"name": self._META_DESCRIPTION_NAME})
        meta_desc = (meta_tag.get("content") or "").strip() if meta_tag else ""

        # Headings: keep only the levels that actually occur on the page
        headings = {}
        for level in range(1, 7):
            tags = soup.find_all(f"h{level}")
            if tags:
                headings[f"h{level}"] = [t.get_text(strip=True) for t in tags]

        # Images: an absent, empty or whitespace-only alt counts as missing
        images = soup.find_all("img")
        images_without_alt = sum(
            1 for img in images if not (img.get("alt") or "").strip()
        )

        # Links
        link_counts = {"internal": 0, "external": 0}
        for link in soup.find_all("a", href=True):
            kind = self._classify_link(base_url, link.get("href", ""))
            if kind is not None:
                link_counts[kind] += 1

        # Word count (body text)
        body = soup.find("body")
        text = body.get_text(separator=" ", strip=True) if body else ""

        return {
            "title": title,
            "title_length": len(title),
            "meta_description": meta_desc,
            "meta_description_length": len(meta_desc),
            "headings": headings,
            "images_total": len(images),
            "images_without_alt": images_without_alt,
            "internal_links": link_counts["internal"],
            "external_links": link_counts["external"],
            "word_count": len(text.split()),
        }

    @staticmethod
    def _score_metrics(metrics):
        """Turn extracted metrics into ``(score, issues)``.

        Starts from 100 and subtracts a fixed penalty per detected problem;
        the result is clamped so it never drops below 0.

        Args:
            metrics: Dict as produced by ``_extract_metrics``.

        Returns:
            A ``(score, issues)`` tuple: ``score`` is an int in 0..100 and
            ``issues`` is a list of messages, one per penalty applied.
        """
        score = 100
        issues = []

        # Title checks
        title_length = metrics["title_length"]
        if not metrics["title"]:
            score -= 20
            issues.append("Missing page title")
        elif title_length < 30:
            score -= 10
            issues.append(f"Title too short ({title_length} chars, aim for 50-60)")
        elif title_length > 60:
            score -= 5
            issues.append(f"Title too long ({title_length} chars, aim for 50-60)")

        # Meta description checks
        meta_desc_length = metrics["meta_description_length"]
        if not metrics["meta_description"]:
            score -= 15
            issues.append("Missing meta description")
        elif meta_desc_length < 120:
            score -= 5
            issues.append(f"Meta description too short ({meta_desc_length} chars, aim for 150-155)")
        elif meta_desc_length > 160:
            score -= 5
            issues.append(f"Meta description too long ({meta_desc_length} chars, aim for 150-155)")

        # Heading checks
        headings = metrics["headings"]
        if "h1" not in headings:
            score -= 15
            issues.append("Missing H1 heading")
        elif len(headings["h1"]) > 1:
            score -= 10
            issues.append(f"Multiple H1 headings ({len(headings['h1'])}). Use only one.")

        if "h2" not in headings:
            score -= 5
            issues.append("No H2 subheadings found")

        # Image checks: 3 points per image, capped at 15
        images_without_alt = metrics["images_without_alt"]
        if images_without_alt > 0:
            score -= min(images_without_alt * 3, 15)
            issues.append(f"{images_without_alt} of {metrics['images_total']} images missing alt text")

        # Content length
        word_count = metrics["word_count"]
        if word_count < 300:
            score -= 15
            issues.append(f"Content too thin ({word_count} words, aim for 1000+)")
        elif word_count < 800:
            score -= 5
            issues.append(f"Content could be longer ({word_count} words, aim for 1000+)")

        # Internal links
        if metrics["internal_links"] < 2:
            score -= 5
            issues.append("Few internal links. Add more for better SEO.")

        return max(0, score), issues
