from __future__ import annotations

import os
import sys
import msal
import requests
from bs4 import BeautifulSoup
import re
from pathlib import Path
from urllib.parse import unquote, urlparse, parse_qs
import io
import base64
import subprocess
import zipfile
import xml.etree.ElementTree as ET
try:
    import pytesseract  # OCR
    from PIL import Image, ImageOps, ImageFilter
except Exception:
    pytesseract = None
    Image = None
try:
    from pypdf import PdfReader as PdfReaderNew
except Exception:
    PdfReaderNew = None
try:
    from PyPDF2 import PdfReader as PdfReaderOld
except Exception:
    PdfReaderOld = None

# --- Configuration via environment variables ---
# The Azure app registration client id; the first non-empty variable wins.
CLIENT_ID = (
    os.getenv("MS_CLIENT_ID")
    or os.getenv("GRAPH_CLIENT_ID")
    or os.getenv("MSAL_CLIENT_ID")
    or ""
)
# Falls back to the multi-tenant "common" authority when unset or blank.
TENANT_ID = (os.getenv("MS_TENANT_ID") or os.getenv("GRAPH_TENANT_ID") or "common").strip() or "common"
FORCE_DEVICE_LOGIN = str(os.getenv("MS_FORCE_DEVICE_LOGIN", "false")).lower() in ("1", "true", "yes")
AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"
SCOPES = ["Notes.Read"]  # Do NOT include reserved scopes (openid, profile, offline_access); MSAL/AAD handles them
GRAPH = "https://graph.microsoft.com/v1.0"

# Tip: to force a login with a different account, delete the TOKEN_CACHE_PATH file (or point MS_TOKEN_CACHE at a new path).
# Persisted token cache for silent refresh between runs (override via MS_TOKEN_CACHE)
TOKEN_CACHE_PATH = os.getenv("MS_TOKEN_CACHE", str(Path.home() / ".cache" / "msal_onenote_token.bin"))
# Import-time side effect: make sure the cache directory exists.
Path(TOKEN_CACHE_PATH).parent.mkdir(parents=True, exist_ok=True)

MAX_PAGES = int(os.getenv("MAX_PAGES", "10"))
TARGET_SECTION_NAME = os.getenv("TARGET_SECTION_NAME", "Funktioner af to variable")
TARGET_NOTEBOOK_NAME = os.getenv("TARGET_NOTEBOOK_NAME")  # None = no filter
# Use a OneNote link (SharePoint/OneNote web link) or a bare section-id/page-id, supplied via environment variables
TARGET_LINK = os.getenv(
    "TARGET_LINK",
    os.getenv(
        "TARGET_SECTION_ID",
        "",
    )
)

# Export mode
EXPORT_WHOLE_SECTION = str(os.getenv("EXPORT_WHOLE_SECTION", "true")).lower() in ("1", "true", "yes")

# OCR cache: store OCR results per image hash so that re-runs are fast
ENABLE_OCR_CACHE = str(os.getenv("ENABLE_OCR_CACHE", "true")).lower() in ("1", "true", "yes")
SITE_HINT = os.getenv("SITE_HINT")  # e.g. "favrskovgymnasium.sharepoint.com/sites/Section_4992"

# --- Configuration switch: fetch only the exact page when a page-id is given ---
EXACT_PAGE_ONLY = str(os.getenv("EXACT_PAGE_ONLY", "true")).lower() in ("1", "true", "yes")

# --- OCR configuration ---
ENABLE_OCR = str(os.getenv("ENABLE_OCR", "true")).lower() in ("1", "true", "yes")
OCR_LANG = os.getenv("OCR_LANG", "dan+eng")
SAVE_IMAGES = str(os.getenv("SAVE_IMAGES", "true")).lower() in ("1", "true", "yes")
ENABLE_ATTACHMENTS = str(os.getenv("ENABLE_ATTACHMENTS", "true")).lower() in ("1", "true", "yes")
SAVE_ATTACHMENTS = str(os.getenv("SAVE_ATTACHMENTS", "true")).lower() in ("1", "true", "yes")
MAX_ATTACHMENT_TEXT_CHARS = int(os.getenv("MAX_ATTACHMENT_TEXT_CHARS", "12000"))

# --- Vision captioning (multimodal) via Ollama ---
ENABLE_IMAGE_CAPTIONS = str(os.getenv("ENABLE_IMAGE_CAPTIONS", "false")).lower() in ("1", "true", "yes")
VISION_MODEL = os.getenv("VISION_MODEL", "llava")
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
CAPTION_MAX_CHARS = int(os.getenv("CAPTION_MAX_CHARS", "800"))

# OCR via OpenAI vision (fallback or forced)
OCR_PROVIDER = str(os.getenv("OCR_PROVIDER", "auto") or "auto").strip().lower()  # auto|tesseract|openai
OPENAI_API_KEY = str(os.getenv("OPENAI_API_KEY", "") or "").strip()
OPENAI_API_BASE = (os.getenv("OPENAI_API_BASE") or os.getenv("OPENAI_BASE_URL") or "https://api.openai.com").strip().rstrip("/")
OPENAI_OCR_MODEL = str(os.getenv("OPENAI_OCR_MODEL", "gpt-4o-mini") or "gpt-4o-mini").strip()
OPENAI_OCR_MAX_CHARS = int(os.getenv("OPENAI_OCR_MAX_CHARS", "12000"))

# Output directory support (for integration from AI_NOTER.py)
OUTPUT_DIR = os.getenv("OUTPUT_DIR")
OUTDIR = Path(OUTPUT_DIR) if OUTPUT_DIR else None
AUTH_ONLY = str(os.getenv("MS_AUTH_ONLY", "false")).lower() in ("1", "true", "yes")

# Derive SITE_HINT from TARGET_LINK when possible and not already set
if SITE_HINT is None and TARGET_LINK:
    m = re.search(r"https://([^/]+/sites/[^/]+)", TARGET_LINK)
    if m:
        SITE_HINT = m.group(1).lower()

# Without a client id no authentication is possible, so abort at import time.
if not CLIENT_ID:
    print("MS_CLIENT_ID mangler i miljøvariabler. Brug din egen Azure App Registration client id.", file=sys.stderr)
    sys.exit(1)

# --- Auth ---
def _load_token_cache():
    """Return an MSAL token cache, pre-filled from TOKEN_CACHE_PATH when readable."""
    cache = msal.SerializableTokenCache()
    try:
        if os.path.exists(TOKEN_CACHE_PATH):
            # Context manager so the file handle is closed even if deserialize fails.
            with open(TOKEN_CACHE_PATH, "r", encoding="utf-8") as f:
                cache.deserialize(f.read())
    except Exception:
        # A corrupt or unreadable cache must not block authentication; start fresh.
        cache = msal.SerializableTokenCache()
    return cache


def _save_token_cache(cache):
    """Write the cache back to TOKEN_CACHE_PATH, but only if MSAL reports a change."""
    if cache.has_state_changed:
        with open(TOKEN_CACHE_PATH, "w", encoding="utf-8") as f:
            f.write(cache.serialize())


def acquire_token_device_code():
    """Acquire a token for SCOPES, silently if possible, else via device code flow.

    A token cache persisted at TOKEN_CACHE_PATH is tried first (unless
    FORCE_DEVICE_LOGIN is set), using the first cached account. If that yields
    no access token, the device code flow is started: the verification URL and
    user code are printed, and the call blocks until the login completes.

    Returns:
        The MSAL result dict; it always contains "access_token".

    Raises:
        RuntimeError: if the device flow cannot be started or the login fails.
    """
    # Use a persistent cache so we can acquire tokens silently on subsequent runs
    cache = _load_token_cache()

    app = msal.PublicClientApplication(client_id=CLIENT_ID, authority=AUTHORITY, token_cache=cache)

    # Try silent first (unless forced device login)
    if not FORCE_DEVICE_LOGIN:
        accounts = app.get_accounts()
        if accounts:
            result = app.acquire_token_silent(SCOPES, account=accounts[0])
            if result and "access_token" in result:
                _save_token_cache(cache)
                return result
    else:
        print("MS_FORCE_DEVICE_LOGIN=true: springer cached token over og kræver nyt login.")

    # Device Code Flow
    # Note: MSAL rejects 'offline_access' in the scopes list as 'reserved';
    # refresh tokens are handled automatically.
    flow = app.initiate_device_flow(scopes=SCOPES)
    if "user_code" not in flow:
        raise RuntimeError("Kunne ikke starte device code flow. Tjek app-indstillinger og scopes.")

    print("Log ind ved at åbne URL’en og indtaste koden:", flush=True)
    print(flow.get("verification_uri") or flow.get("verification_uri_complete"), flush=True)
    print("Kode:", flow["user_code"], flush=True)  # the user enters this code on the page above

    result = app.acquire_token_by_device_flow(flow)  # blocks until the login has completed
    if "access_token" not in result:
        raise RuntimeError(f"Login mislykkedes: {result}")

    # Persist the refreshed cache
    _save_token_cache(cache)
    return result

# --- Graph helpers ---
def graph_get(url, token, params=None, accept_html=False):
    """Issue an authenticated GET and return the response object.

    The Accept header is "text/html" when accept_html is true, otherwise
    "application/json". Any non-OK status raises RuntimeError carrying the
    status code and the first 400 characters of the response body.
    """
    accept = "text/html" if accept_html else "application/json"
    request_headers = {"Authorization": f"Bearer {token}", "Accept": accept}
    response = requests.get(url, headers=request_headers, params=params, timeout=60)
    if response.ok:
        return response
    raise RuntimeError(f"Graph GET fejlede {response.status_code}: {response.text[:400]}")

def try_get_section(token, section_id):
    """Probe a section under /me without raising on HTTP errors.

    Returns (ok, payload): ok is the response's success flag, payload is the
    decoded JSON body when the Content-Type starts with "application/json",
    else an empty dict.
    """
    response = requests.get(
        f"{GRAPH}/me/onenote/sections/{section_id}",
        headers={"Authorization": f"Bearer {token}", "Accept": "application/json"},
        timeout=60,
    )
    succeeded = response.ok
    content_type = response.headers.get("Content-Type", "")
    payload = response.json() if content_type.startswith("application/json") else {}
    return succeeded, payload

# --- SharePoint Site helpers for read-only class notebooks ---
def resolve_site_id(token, site_hint: str) -> str:
    """Resolve a SharePoint site id from a hint like 'host/sites/Section_XXXX'.

    Args:
        token: Bearer token for Microsoft Graph.
        site_hint: '<hostname>/<server-relative path>'. A leading http(s)://
            and surrounding slashes are tolerated.

    Returns:
        The "id" field of the Graph site resource (None if the field is absent).

    Raises:
        ValueError: if the hint has no hostname or no path part.
        RuntimeError: propagated from graph_get when the lookup fails.
    """
    hint = re.sub(r"^https?://", "", site_hint.strip(), flags=re.IGNORECASE)
    host, _, path = hint.partition("/")
    path = path.strip("/")
    if not host or not path:
        raise ValueError("SITE_HINT skal være 'host/sites/yourSite' (uden https://)")
    # Graph addresses a site by path as /sites/{hostname}:/{server-relative-path}.
    # Exactly one slash follows the colon; the previous code emitted "host://path".
    url = f"{GRAPH}/sites/{host}:/{path}"
    r = graph_get(url, token)
    return r.json().get("id")

def list_site_notebooks(token, site_id, top=200):
    """Return the notebooks of a site; only the first result page is read."""
    endpoint = f"{GRAPH}/sites/{site_id}/onenote/notebooks"
    payload = graph_get(endpoint, token, params={"$top": top}).json()
    return payload.get("value", [])

def list_site_sections(token, site_id, top=300):
    """Return the sections of a site; only the first result page is read."""
    endpoint = f"{GRAPH}/sites/{site_id}/onenote/sections"
    payload = graph_get(endpoint, token, params={"$top": top}).json()
    return payload.get("value", [])

def try_get_site_section(token, site_id, section_id):
    """Probe a site-scoped section without raising on HTTP errors.

    Returns (ok, payload): ok is the response's success flag, payload is the
    decoded JSON body when the Content-Type starts with "application/json",
    else an empty dict.
    """
    response = requests.get(
        f"{GRAPH}/sites/{site_id}/onenote/sections/{section_id}",
        headers={"Authorization": f"Bearer {token}", "Accept": "application/json"},
        timeout=60,
    )
    succeeded = response.ok
    content_type = response.headers.get("Content-Type", "")
    payload = response.json() if content_type.startswith("application/json") else {}
    return succeeded, payload

def list_pages_in_site_section(token, site_id, section_id, top=50):
    """Return up to `top` pages of a site-scoped section (first result page only)."""
    endpoint = f"{GRAPH}/sites/{site_id}/onenote/sections/{section_id}/pages"
    payload = graph_get(endpoint, token, params={"$top": top}).json()
    return payload.get("value", [])

# --- Helper: paginate all pages in a site-scoped section ---
def list_pages_in_site_section_all(token, site_id, section_id, page_size=100, hard_limit=5000):
    """Return all pages of a site-scoped section by following @odata.nextLink.

    Fetching stops when there is no next link or when at least hard_limit
    pages have been collected. The limit is checked between requests only,
    so the result can exceed hard_limit by part of one batch.
    """
    collected = []
    next_url = f"{GRAPH}/sites/{site_id}/onenote/sections/{section_id}/pages"
    query = {"$top": page_size, "$orderby": "lastModifiedDateTime desc"}
    while True:
        if not next_url or len(collected) >= hard_limit:
            break
        payload = graph_get(next_url, token, params=query).json()
        collected.extend(payload.get("value", []))
        next_url = payload.get("@odata.nextLink")
        # The nextLink already carries the query string.
        query = None
    return collected

# --- Helper: list resources for a OneNote page (prefer this over scraping <img> tags) ---
def list_page_resources(token: str, page_id: str, site_id: str | None = None, top: int = 200):
    """Return the resources of a OneNote page via Graph.

    The site-scoped endpoint is used when site_id is given, otherwise /me.
    The URL and the number of resources are printed as debug output.
    """
    scope = f"sites/{site_id}" if site_id else "me"
    url = f"{GRAPH}/{scope}/onenote/pages/{page_id}/resources"
    print(f"[DEBUG] list_page_resources -> {url}")
    payload = graph_get(url, token, params={"$top": top}).json()
    print(f"[DEBUG] resources count: {len(payload.get('value', []))}")
    return payload.get("value", [])


# Helper functions for sections and for pages within sections
def list_sections(token, top=100):
    """Fetch the sections under /me, print one summary line each, and return them."""
    response = graph_get(f"{GRAPH}/me/onenote/sections", token, params={"$top": top})
    found = response.json().get("value", [])
    for number, section in enumerate(found, start=1):
        parent = section.get("parentNotebook") or {}
        nb = parent.get("displayName")
        print(f"Sektion {number:02d}. {section.get('displayName')}  |  id={section.get('id')}  |  notebook={nb}")
    return found

# Extract the section id from a link or a raw id
def extract_section_id(section_link_or_id: str) -> tuple[str, str]:
    """Extract a section id from a raw id or a full OneNote/SharePoint link.

    Tried in order:
      1. A raw id (starts with "1-", contains no space) is returned unchanged.
      2. 'section-id=...' in links such as
         'onenote:https://...#...&section-id={id}&end'; surrounding braces
         are removed.
      3. SharePoint Doc.aspx links: the 36-character GUID after the pipe in
         'wd=target(...|GUID/)'.
      4. Last resort: the stripped input itself.

    Returns:
        (section_id, guid_hint). guid_hint repeats the id when it came from a
        link (cases 2 and 3) and is "" otherwise. Empty or None input gives
        ("", "").
    """
    # None is tolerated, like extract_page_id does.
    s = (section_link_or_id or "").strip()
    if not s:
        return "", ""
    # Looks like a raw id (these start with "1-" and carry a GUID).
    if s.startswith("1-") and " " not in s:
        return s, ""
    # Otherwise look for section-id=...
    m = re.search(r"section-id=([^&#]+)", s, flags=re.IGNORECASE)
    if m:
        val = unquote(m.group(1))
        # Shared links wrap GUIDs in curly braces {..}; drop them.
        if val.startswith("{") and val.endswith("}"):
            val = val[1:-1]
        return val, val

    # SharePoint Doc.aspx fallback:
    # .../Doc.aspx?...&wd=target(...|B8BDFA90-D71C-43E2-9026-DF6D845BE89D/)...&...
    try:
        s_url = s[len("onenote:"):] if s.lower().startswith("onenote:") else s
        for wd in parse_qs(urlparse(s_url).query or "").get("wd", []):
            m_wd = re.search(r"\|([0-9a-fA-F-]{36})/", unquote(wd or ""))
            if m_wd:
                gid = m_wd.group(1)
                return gid, gid
    except Exception:
        # Best effort: an unparsable link falls through to the last resort.
        pass
    return s, ""  # last resort: return the input as is

# Extract the page id from a link or a raw id
def extract_page_id(page_link_or_id: str) -> tuple[str, str]:
    """Pull a page id out of a raw id or a full OneNote link.

    Returns (page_id, guid_hint):
      - a raw id (starts with "1-", no space) is returned as (id, "");
      - a link containing 'page-id=...' gives (guid, guid), with the value
        URL-decoded and surrounding braces removed;
      - anything else, including empty or None input, gives ("", "").
    """
    candidate = (page_link_or_id or "").strip()
    if not candidate:
        return "", ""

    looks_raw = candidate.startswith("1-") and " " not in candidate
    if looks_raw:
        return candidate, ""

    found = re.search(r"page-id=([^&#]+)", candidate, flags=re.IGNORECASE)
    if found is None:
        return "", ""

    guid = unquote(found.group(1))
    if guid[:1] == "{" and guid[-1:] == "}":
        guid = guid[1:-1]
    return guid, guid

# Extract a human-readable title hint from the link's fragment (between # and the first &)
def extract_title_hint(link: str) -> str:
    """Return the URL-decoded, stripped text between '#' and the first '&', or ""."""
    fragment = re.search(r"#([^&]+)", link) if link else None
    if fragment is None:
        return ""
    raw = fragment.group(1)
    try:
        decoded = unquote(raw)
    except Exception:
        # Fall back to the undecoded fragment text.
        return raw.strip()
    return decoded.strip()

# Extract the section name from the link's path: .../Some%20Name.one#... -> 'Some Name'
def extract_section_name_hint(link: str) -> str:
    """Return the section name encoded in a OneNote link, or "" if none is found.

    First looks for a path segment ending in '.one' that is followed by '#',
    '?' or the end of the string. If there is none, falls back to SharePoint
    Doc.aspx links of the form wd=target(.../Some%20Name.one|GUID/...).
    The name is URL-decoded and stripped.
    """
    if not link:
        return ""

    direct = re.search(r"/([^/]+)\.one(?:[#?]|$)", link)
    if direct is not None:
        encoded = direct.group(1)
        try:
            return unquote(encoded).strip()
        except Exception:
            return encoded.strip()

    # SharePoint Doc.aspx fallback: the name sits inside the wd query parameter.
    try:
        target = link[8:] if link.lower().startswith("onenote:") else link
        query = urlparse(target).query or ""
        for wd_value in parse_qs(query).get("wd", []):
            hit = re.search(r"/([^/|]+)\.one\|", unquote(wd_value or ""))
            if hit:
                return unquote(hit.group(1)).strip()
    except Exception:
        pass
    return ""

# --- NEW HELPER FUNCTIONS ---
def list_notebooks(token, top=200):
    """Fetch the notebooks under /me, print one summary line each, and return them."""
    response = graph_get(f"{GRAPH}/me/onenote/notebooks", token, params={"$top": top})
    notebooks = response.json().get("value", [])
    for number, notebook in enumerate(notebooks, start=1):
        links = notebook.get("links") or {}
        web_link = links.get("oneNoteWebUrl") or {}
        web = web_link.get("href")
        print(f"Notesbog {number:02d}. {notebook.get('displayName')} | id={notebook.get('id')} | web={web}")
    return notebooks

def list_sections_in_notebook(token, notebook_id, top=200):
    """Return the sections of one notebook under /me (first result page only)."""
    endpoint = f"{GRAPH}/me/onenote/notebooks/{notebook_id}/sections"
    payload = graph_get(endpoint, token, params={"$top": top}).json()
    return payload.get("value", [])

def list_pages_in_section(token, section_id, top=50):
    """Return up to `top` pages of a section under /me (first result page only)."""
    endpoint = f"{GRAPH}/me/onenote/sections/{section_id}/pages"
    payload = graph_get(endpoint, token, params={"$top": top}).json()
    return payload.get("value", [])

# --- Helper: paginate all pages in a /me section ---
def list_pages_in_section_all(token, section_id, page_size=100, hard_limit=5000):
    """Return all pages of a /me section by following @odata.nextLink.

    Fetching stops when there is no next link or when at least hard_limit
    pages have been collected. The limit is checked between requests only,
    so the result can exceed hard_limit by part of one batch.
    """
    collected = []
    next_url = f"{GRAPH}/me/onenote/sections/{section_id}/pages"
    query = {"$top": page_size, "$orderby": "lastModifiedDateTime desc"}
    while True:
        if not next_url or len(collected) >= hard_limit:
            break
        payload = graph_get(next_url, token, params=query).json()
        collected.extend(payload.get("value", []))
        next_url = payload.get("@odata.nextLink")
        # The nextLink already carries the query string.
        query = None
    return collected

def list_pages(token, top=10):
    """Fetch pages under /me, print one summary line each, and return them."""
    response = graph_get(f"{GRAPH}/me/onenote/pages", token, params={"$top": top})
    found = response.json().get("value", [])
    # Friendly console listing.
    for number, page in enumerate(found, start=1):
        print(f"{number:02d}. {page.get('title')}  |  id={page.get('id')}  |  created={page.get('createdDateTime')}")
    return found


def fetch_page_content_html(token, page_id=None, site_id=None, content_url=None):
    """Download the HTML content of a OneNote page and return it as text.

    URL choice, in priority order: content_url when given; the site-scoped
    endpoint when site_id is given; otherwise the /me endpoint.
    Raises RuntimeError on any non-OK response.
    """
    if content_url:
        target = content_url
    else:
        scope = f"sites/{site_id}" if site_id else "me"
        target = f"{GRAPH}/{scope}/onenote/pages/{page_id}/content"

    response = requests.get(
        target,
        headers={"Authorization": f"Bearer {token}", "Accept": "text/html"},
        timeout=60,
    )
    if response.ok:
        return response.text
    raise RuntimeError(f"Graph GET fejlede {response.status_code}: {response.text[:400]}")

# --- Very simple parser: HTML -> text + naive concept candidates ---
def simplify_html_to_text(html):
    """Flatten page HTML into plain text plus a list of naive concept candidates.

    Every h1/h2/h3/p/li element with non-empty text becomes one line, with
    whitespace collapsed; headings are prefixed with "# ". Concept candidates
    are the title-cased words longer than two characters.

    Returns:
        (plain, candidates): the newline-joined text and the sorted candidates.
    """
    heading_tags = ("h1", "h2", "h3")
    soup = BeautifulSoup(html, "lxml")
    rendered = []
    capitalised = set()

    for element in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        text = " ".join(element.get_text(separator=" ", strip=True).split())
        if text:
            is_heading = element.name.lower() in heading_tags
            rendered.append(f"# {text}" if is_heading else text)
            capitalised.update(
                word for word in text.split() if word.istitle() and len(word) > 2
            )

    return "\n".join(rendered), sorted(capitalised)


# --- OCR helper functions ---
def _looks_like_image_url(url: str) -> bool:
    u = (url or "").lower()
    return bool(re.search(r"\.(png|jpe?g|gif|bmp|webp|tiff?|svg)(?:$|[?#])", u))


def _looks_like_attachment_url(url: str) -> bool:
    u = (url or "").lower()
    return bool(re.search(r"\.(pdf|docx?|pptx?|xlsx?|xls|txt|rtf|csv|md|json|xml)(?:$|[?#])", u))


def extract_image_urls_from_html(html: str) -> list[str]:
    """Collect image URLs from OneNote HTML, de-duplicated in first-seen order.

    For each <img>, data-fullres-src, src and data-src are read, in that
    order. An <object data=...> is included when it has no data-attachment
    name and either an image/* type or an image-looking URL.
    """
    soup = BeautifulSoup(html, "lxml")
    # A dict keeps insertion order and gives de-duplication for free.
    ordered: dict[str, None] = {}

    def remember(candidate):
        if isinstance(candidate, str) and candidate:
            cleaned = candidate.strip()
            if cleaned:
                ordered.setdefault(cleaned, None)

    # Full-res first: OneNote often provides both a preview and a full-res source.
    for img in soup.find_all("img"):
        for attribute in ("data-fullres-src", "src", "data-src"):
            remember(img.get(attribute))

    # Images may also be embedded as <object data=...>; skip plain file attachments.
    for obj in soup.find_all("object"):
        data_url = (obj.get("data") or "").strip()
        mime = (obj.get("type") or "").lower().strip()
        attachment_name = (obj.get("data-attachment") or "").strip()
        if not data_url or attachment_name:
            continue
        if mime.startswith("image/") or _looks_like_image_url(data_url):
            remember(data_url)

    return list(ordered)


def extract_attachment_objects_from_html(html: str) -> list[dict]:
    """List file attachments embedded as <object data-attachment=... data=...>.

    An <object> with a data URL counts as an attachment when it has a
    data-attachment name, or a non-image type, or an attachment-looking
    (and not image-looking) URL. Entries are de-duplicated on
    (lower-cased name, url).

    Returns:
        A list of dicts with the keys "name", "url" and "content_type".
    """
    soup = BeautifulSoup(html, "lxml")
    attachments = []
    seen_keys = set()

    for obj in soup.find_all("object"):
        name = (obj.get("data-attachment") or "").strip()
        url = (obj.get("data") or "").strip()
        ctype = (obj.get("type") or "").strip()
        if not url:
            continue

        if not name:
            # Fallback heuristics for objects without a data-attachment attribute.
            typed_non_image = bool(ctype) and not ctype.lower().startswith("image/")
            if not typed_non_image:
                url_says_file = _looks_like_attachment_url(url) and not _looks_like_image_url(url)
                if not url_says_file:
                    continue

        dedupe_key = (name.lower(), url)
        if dedupe_key not in seen_keys:
            seen_keys.add(dedupe_key)
            attachments.append({"name": name, "url": url, "content_type": ctype})

    return attachments


def guess_ext_from_mime(content_type: str) -> str:
    """Map a MIME type to a file extension (with leading dot), or "" if unknown.

    Only the media type is considered: parameters such as "; charset=utf-8"
    and surrounding whitespace are ignored, and matching is case-insensitive.
    None or an empty string yields "".
    """
    # Cut off any parameters so "text/plain; charset=utf-8" still matches.
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    mapping = {
        "application/pdf": ".pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/msword": ".doc",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        "text/plain": ".txt",
        "text/csv": ".csv",
        "application/rtf": ".rtf",
    }
    return mapping.get(media_type, "")


def normalize_text_block(text: str) -> str:
    """Normalise line endings and blank-line runs in a block of text.

    Converts CRLF/CR to LF, removes spaces and tabs directly before a newline,
    collapses three or more consecutive newlines into two, and strips the
    result. None is treated as an empty string.
    """
    unified = (text or "").replace("\r\n", "\n").replace("\r", "\n")
    without_trailing_ws = re.sub(r"[ \t]+\n", "\n", unified)
    collapsed = re.sub(r"\n{3,}", "\n\n", without_trailing_ws)
    return collapsed.strip()


def extract_text_from_docx_bytes(data: bytes) -> str:
    """Extract paragraph text from the bytes of a .docx file.

    Reads word/document.xml followed by any word/header*.xml and
    word/footer*.xml parts (sorted by name). Each w:p element becomes one line
    made of its concatenated w:t runs. Parts that fail to parse are skipped;
    if the archive itself cannot be read, "" is returned.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    paragraphs: list[str] = []
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as archive:
            members = set(archive.namelist())
            extra_parts = sorted(
                member
                for member in members
                if member.startswith(("word/header", "word/footer")) and member.endswith(".xml")
            )
            for part in ["word/document.xml", *extra_parts]:
                if part not in members:
                    continue
                try:
                    root = ET.fromstring(archive.read(part))
                except Exception:
                    continue
                for paragraph in root.findall(".//w:p", ns):
                    line = "".join(
                        node.text for node in paragraph.findall(".//w:t", ns) if node.text
                    ).strip()
                    if line:
                        paragraphs.append(line)
    except Exception:
        return ""
    return normalize_text_block("\n".join(paragraphs))


def extract_text_from_pdf_bytes(data: bytes) -> str:
    """Extract text from PDF bytes using whichever PDF reader class is available.

    PdfReaderNew is tried before PdfReaderOld. The first reader that produces
    non-empty normalised text wins; a reader that raises is skipped. Returns ""
    when no reader is available or none extracts any text.
    """
    candidates = [PdfReaderNew] if PdfReaderNew else []
    if PdfReaderOld and PdfReaderOld not in candidates:
        candidates.append(PdfReaderOld)

    for make_reader in candidates:
        try:
            document = make_reader(io.BytesIO(data))
            chunks = []
            for page in document.pages:
                page_text = page.extract_text() or ""
                if page_text.strip():
                    chunks.append(page_text)
            combined = normalize_text_block("\n\n".join(chunks))
        except Exception:
            continue
        if combined:
            return combined
    return ""


def extract_text_from_attachment_bytes(data: bytes, filename: str, content_type: str) -> str:
    """Extract text from an attachment, dispatching on extension and MIME type.

    .docx and .pdf go to their dedicated extractors. Text-like files (by
    extension, or a text/* content type) are decoded as UTF-8 with undecodable
    bytes dropped; for .html/.htm the markup is stripped as well. Every other
    type yields "".
    """
    suffix = Path(filename or "").suffix.lower()
    mime = (content_type or "").lower()

    if suffix == ".docx" or "wordprocessingml.document" in mime:
        return extract_text_from_docx_bytes(data)
    if suffix == ".pdf" or "application/pdf" in mime:
        return extract_text_from_pdf_bytes(data)

    is_html = suffix in {".html", ".htm"}
    is_textual = (
        is_html
        or suffix in {".txt", ".md", ".csv", ".json", ".xml"}
        or mime.startswith("text/")
    )
    if not is_textual:
        return ""

    try:
        decoded = data.decode("utf-8", errors="ignore")
    except Exception:
        return ""
    if is_html:
        try:
            decoded = BeautifulSoup(decoded, "lxml").get_text(separator="\n")
        except Exception:
            # Keep the raw markup if HTML parsing fails.
            pass
    return normalize_text_block(decoded)


def extract_attachments_from_page_html_to_dir(
    token: str,
    html: str,
    site_id: str | None,
    export_root: Path,
    page_id_safe: str,
) -> tuple[str, int, int, list[str], list[dict]]:
    """Download the attachments referenced in a page's HTML and extract their text.

    For every attachment <object> found in `html`, the resource is fetched,
    optionally written to `<export_root>/attachments/` (when SAVE_ATTACHMENTS
    is set), and passed through text extraction. Failures are recorded per
    attachment instead of being raised.

    Args:
        token: Bearer token used for fetching the resources.
        html: The page HTML to scan.
        site_id: Forwarded to normalize_graph_url when rewriting resource URLs.
        export_root: Directory under which the "attachments" folder is created.
        page_id_safe: Prefix for the saved file names; presumably already
            sanitised by the caller — TODO confirm.

    Returns:
        (block, found, with_text, saved_paths, items):
        block is a human-readable "[ATTACHMENTS]" report, found the number of
        attachment entries, with_text how many yielded text, saved_paths the
        written files as strings, and items one metadata dict per attachment.
    """
    entries = extract_attachment_objects_from_html(html)
    found = len(entries)
    with_text = 0
    saved_paths: list[str] = []
    items: list[dict] = []

    # Nothing to do: return the fixed summary without creating any directory.
    if not entries:
        block = "[ATTACHMENTS]\nFound attachments: 0 | Attachments with extracted text: 0"
        return block, found, with_text, saved_paths, items

    att_dir = export_root / "attachments"
    att_dir.mkdir(parents=True, exist_ok=True)

    block_lines = ["[ATTACHMENTS]"]
    # Placeholder summary at index 1; overwritten after the loop with the final count.
    block_lines.append(f"Found attachments: {found} | Attachments with extracted text: 0")

    for idx, entry in enumerate(entries, start=1):
        name_raw = (entry.get("name") or "").strip()
        url_raw = (entry.get("url") or "").strip()
        ctype = (entry.get("content_type") or "").strip()

        url = normalize_graph_url(url_raw, site_id)
        if not name_raw:
            # fallback basename from URL path
            try:
                name_raw = unquote(Path(url.split("?", 1)[0]).name)
            except Exception:
                name_raw = f"attachment_{idx}"

        # Prefer the extension from the name; otherwise guess it from the MIME type.
        ext = Path(name_raw).suffix.lower()
        if not ext:
            ext = guess_ext_from_mime(ctype)

        # safe_filename is defined outside this view; presumably it sanitises the stem — TODO confirm.
        stem = safe_filename(Path(name_raw).stem)[:80] or f"attachment_{idx}"
        filename = f"{page_id_safe}_{idx}_{stem}{ext}"
        out_file = att_dir / filename

        text = ""
        saved_rel = ""
        error_msg = ""

        try:
            blob = fetch_resource_bytes(token, url)

            if SAVE_ATTACHMENTS:
                with open(out_file, "wb") as f:
                    f.write(blob)
                # Report the path relative to export_root when possible.
                try:
                    saved_rel = str(out_file.relative_to(export_root))
                except Exception:
                    saved_rel = str(out_file)
                saved_paths.append(str(out_file))

            text = extract_text_from_attachment_bytes(blob, filename=name_raw, content_type=ctype)
            # Cap the extracted text and mark the cut.
            if MAX_ATTACHMENT_TEXT_CHARS and len(text) > MAX_ATTACHMENT_TEXT_CHARS:
                text = text[:MAX_ATTACHMENT_TEXT_CHARS].rstrip() + "\n...[truncated]..."
            text = normalize_text_block(text)
            if text:
                with_text += 1
        except Exception as ex:
            # Best effort: keep going with the next attachment and report the error.
            error_msg = str(ex)

        items.append(
            {
                "attachment_name": name_raw,
                "content_type": ctype,
                "source_url": url_raw,
                "normalized_url": url,
                "attachment_path": saved_rel or None,
                "text_extracted": bool(text),
                "error": error_msg or None,
            }
        )

        block_lines.append(f"- {name_raw}")
        if saved_rel:
            block_lines.append(f"  FILE: {saved_rel}")
        if ctype:
            block_lines.append(f"  TYPE: {ctype}")
        if text:
            block_lines.append("  EXTRACTED_TEXT:")
            for ln in text.splitlines():
                block_lines.append("    " + ln)
        elif error_msg:
            block_lines.append(f"  NOTE: extraction failed ({error_msg[:180]})")
        else:
            block_lines.append("  NOTE: no text extracted")

    # Update summary with final with_text count
    block_lines[1] = f"Found attachments: {found} | Attachments with extracted text: {with_text}"
    block = "\n".join(block_lines).strip()
    return block, found, with_text, saved_paths, items

def normalize_graph_url(img_src: str, site_id: str | None = None) -> str:
    """Turn a resource reference from page HTML into a callable URL.

    - Absolute http(s) URLs are kept. When site_id is given, Graph OneNote
      resource URLs (.../[me/]onenote/resources/{rid}/content, or the
      siteCollections/.../resources/{rid}/$value variant) are rewritten to
      the site-scoped .../sites/{site_id}/onenote/resources/{rid}/content.
    - Paths starting with "/" are treated as Graph paths: "/v1.0/..." gets
      the Graph host prepended, anything else gets the v1.0 base URL. With a
      site_id, a /v1.0/[me/]onenote/resources/{rid}/content path is rewritten
      to the site-scoped form.
    - Anything else, and empty input, is returned as is (stripped if non-empty).
    """
    if not img_src:
        return img_src

    src = img_src.strip()

    def site_scoped(resource_id):
        return f"{GRAPH}/sites/{site_id}/onenote/resources/{resource_id}/content"

    if src.startswith(("http://", "https://")):
        if site_id:
            absolute_patterns = (
                # e.g. https://graph.microsoft.com/v1.0/me/onenote/resources/{rid}/content
                r"https://graph\.microsoft\.com/v1\.0/(?:me/)?onenote/resources/([^/]+)/content",
                # e.g. https://graph.microsoft.com/v1.0/siteCollections/<SITEID>/onenote/resources/<RID>/$value
                r"https://graph\.microsoft\.com/v1\.0/siteCollections/[^/]+/onenote/resources/([^/]+)/\$value",
            )
            for pattern in absolute_patterns:
                hit = re.search(pattern, src)
                if hit:
                    return site_scoped(hit.group(1))
        return src

    if not src.startswith("/"):
        # Neither absolute nor a Graph-relative path: hand it back untouched.
        return src

    relative_hit = re.search(r"/v1\.0/(?:me/)?onenote/resources/([^/]+)/content", src)
    if relative_hit and site_id:
        return site_scoped(relative_hit.group(1))
    if src.startswith("/v1.0/"):
        return "https://graph.microsoft.com" + src
    return GRAPH + src

def fetch_resource_bytes(token: str, url: str) -> bytes:
    """Fetch a resource while preserving Authorization across redirects.

    requests will often drop Authorization on cross-host redirects, so
    redirects are followed manually, re-sending the same headers on each hop.
    At most six requests are made.

    Args:
        token: Bearer token sent on every hop.
        url: The resource URL to start from.

    Returns:
        The body of the first non-redirect response.

    Raises:
        RuntimeError: on a non-OK final response, or when every one of the
            six requests answered with a redirect.
    """
    from urllib.parse import urljoin  # local import: only needed here

    # NOTE(review): the bearer token is forwarded to whichever host a redirect
    # names. That is the stated intent of this helper, but confirm it is
    # acceptable for non-Graph redirect targets.
    headers = {"Authorization": f"Bearer {token}", "Accept": "*/*"}

    current = url
    for hop in range(6):
        r = requests.get(current, headers=headers, timeout=(15, 60), allow_redirects=False)
        # DEBUG: on an auth failure at the first hop, print a bit of the response
        try:
            if hop == 0 and r.status_code in (401, 403):
                print(f"[DEBUG] fetch_resource_bytes status={r.status_code} url={current}")
                print(f"[DEBUG] response head: {r.text[:200]}")
        except Exception:
            pass
        location = r.headers.get("Location")
        if 300 <= r.status_code < 400 and location:
            # Location may be relative; resolve it against the URL just requested.
            current = urljoin(current, location)
            continue
        if not r.ok:
            raise RuntimeError(f"Resource fetch failed {r.status_code}: {r.text[:200]}")
        return r.content

    raise RuntimeError("Resource fetch failed: too many redirects")

def preprocess_for_ocr(img: Image.Image) -> Image.Image:
    """Prepare an image for OCR; return the input unchanged if any step fails.

    Steps: convert to grayscale, apply autocontrast, double the size when both
    dimensions are below 1200 px (LANCZOS resampling, falling back to the
    default resampling), then apply a 3x3 median filter against noise.
    """
    try:
        gray = ImageOps.autocontrast(ImageOps.grayscale(img))
        width, height = gray.size
        # Small images are upscaled x2 to help Tesseract.
        if max(width, height) < 1200:
            doubled = (width * 2, height * 2)
            try:
                gray = gray.resize(doubled, resample=Image.LANCZOS)
            except Exception:
                gray = gray.resize(doubled)
        return gray.filter(ImageFilter.MedianFilter(size=3))
    except Exception:
        return img

def openai_ocr_enabled() -> bool:
    """Return True when OCR_PROVIDER is not "tesseract" and OPENAI_API_KEY is non-empty."""
    return OCR_PROVIDER != "tesseract" and bool(OPENAI_API_KEY)

def openai_vision_ocr(image_bytes: bytes) -> str:
    """Extract text from an image via the OpenAI chat-completions endpoint.

    The image is sent inline as a base64 data URL to
    ``{OPENAI_API_BASE}/v1/chat/completions`` using OPENAI_OCR_MODEL.

    Args:
        image_bytes: Raw image file content.

    Returns:
        The extracted text, stripped and truncated to OPENAI_OCR_MAX_CHARS.
        An empty string is returned when OpenAI OCR is disabled, the request
        fails, or the response has no usable content (best effort: this
        function never raises for request/parse errors).
    """
    if not openai_ocr_enabled():
        return ""

    # Declare the real MIME type in the data URL instead of always claiming PNG.
    # Same magic-byte sniffing the file uses when picking image file extensions;
    # PNG stays the default for anything unrecognised.
    if image_bytes.startswith(b"\xff\xd8"):
        mime = "image/jpeg"
    elif image_bytes.startswith(b"GIF8"):
        mime = "image/gif"
    elif image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        mime = "image/webp"
    else:
        mime = "image/png"

    img_b64 = base64.b64encode(image_bytes).decode("utf-8")
    url = f"{OPENAI_API_BASE}/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": OPENAI_OCR_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "Extract all visible text from the image. "
                    "Keep original language and line breaks when possible. "
                    "Do not summarize."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "OCR this image and return only the extracted text."},
                    {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{img_b64}"}},
                ],
            },
        ],
        "temperature": 0,
    }
    try:
        r = requests.post(url, headers=headers, json=payload, timeout=(20, 120))
        if not r.ok:
            return ""
        data = r.json()
        choices = data.get("choices") if isinstance(data, dict) else None
        if not isinstance(choices, list) or not choices:
            return ""
        msg = choices[0].get("message") if isinstance(choices[0], dict) else None
        content = (msg or {}).get("content")
        text = ""
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            # Content may also arrive as a list of parts; keep the non-empty text parts.
            parts: list[str] = []
            for part in content:
                if not isinstance(part, dict):
                    continue
                t = part.get("text")
                if isinstance(t, str) and t.strip():
                    parts.append(t.strip())
            text = "\n".join(parts).strip()
        return (text or "").strip()[:OPENAI_OCR_MAX_CHARS]
    except Exception:
        # OCR is best effort; callers treat "" as "no text found".
        return ""

def ocr_image_bytes(data: bytes, lang: str = OCR_LANG) -> str:
    """Run OCR on raw image bytes and return the recognised text.

    With OCR_PROVIDER == "openai" the OpenAI path is used exclusively.
    Otherwise Tesseract is tried (when pytesseract and PIL are available)
    with several page-segmentation modes, and OpenAI serves as a fallback
    when Tesseract is unavailable or yields nothing.
    """
    # Forced OpenAI OCR.
    if OCR_PROVIDER == "openai":
        return openai_vision_ocr(data)

    recognized = ""
    # Default/forced Tesseract OCR.
    if pytesseract and Image:
        try:
            prepared = preprocess_for_ocr(Image.open(io.BytesIO(data)))
            # psm 3 is the first attempt; the remaining modes are alternative
            # layout assumptions tried only while nothing has been recognised.
            for psm in (3, 6, 7, 11, 12, 13):
                recognized = pytesseract.image_to_string(
                    prepared, lang=lang, config=f"--oem 1 --psm {psm}"
                ).strip()
                if recognized:
                    break
        except Exception:
            recognized = ""

    # Automatic fallback to OpenAI when Tesseract is missing or found no text.
    if not recognized and openai_ocr_enabled():
        recognized = openai_vision_ocr(data)
    return recognized


# --- OCR aggregator: use Graph resources endpoint if possible ---


# Helper: build stable Graph resource content URL
def graph_resource_content_url(resource_item: dict, site_id: str | None) -> str | None:
    """Build a stable Graph URL for a OneNote resource's binary content.

    IMPORTANT: Use the resource item's `id` (not `self`) to build /onenote/resources/{id}/content.
    """
    rid = resource_item.get("id")
    if not rid:
        # Sometimes we only get a self/@odata.id URL — extract the last path segment
        self_url = resource_item.get("self") or resource_item.get("@odata.id") or ""
        m = re.search(r"/onenote/resources/([^/]+)$", str(self_url))
        if m:
            rid = m.group(1)
    if not rid:
        return None

    if site_id:
        return f"{GRAPH}/sites/{site_id}/onenote/resources/{rid}/content"
    return f"{GRAPH}/me/onenote/resources/{rid}/content"

def ocr_all_images_in_page_html(token: str, html: str, site_id: str | None, save_prefix: str | None = None, page_graph_id: str | None = None) -> tuple[str, int, int, list[str]]:
    """OCR every image on a page.

    Image URLs are taken from the Graph resources listing when `page_graph_id`
    is given and yields at least one URL; otherwise they are extracted from the
    page HTML. Images that fail to download or OCR are skipped silently.

    Returns (combined_text, images_found, images_with_text, saved_paths).
    """
    ocr_chunks = []
    written_files = []
    hits = 0
    image_urls = []

    # 1) Try the Graph resources listing first (most stable).
    if page_graph_id:
        try:
            resources = list_page_resources(token, page_graph_id, site_id, top=500)
            print(f"[DEBUG] resources returned: {len(resources)}")
            for resource in resources:
                ctype = (resource.get("contentType") or "").lower()
                content_url = graph_resource_content_url(resource, site_id)

                # DEBUG: show what came back.
                print(f"[DEBUG] resource type: {ctype}")

                # Resources without a content type are kept; typed ones must be images.
                looks_like_image = (not ctype) or ctype.startswith("image/")
                if looks_like_image and content_url:
                    image_urls.append(content_url)

        except Exception as ex:
            print(f"[DEBUG] resource fetch error: {ex}")

    # 2) Fallback: image URLs parsed out of the HTML.
    if not image_urls:
        image_urls = extract_image_urls_from_html(html)
        print(f"[DEBUG] html image urls: {len(image_urls)}")

    total = len(image_urls)
    for position, raw_url in enumerate(image_urls, start=1):
        resolved = normalize_graph_url(raw_url, site_id)
        try:
            payload = fetch_resource_bytes(token, resolved)
            if SAVE_IMAGES and save_prefix:
                # Guess the file type from the leading bytes; PNG is the default.
                signature = payload[:12]
                if signature.startswith(b"\xff\xd8"):
                    suffix = ".jpg"
                elif signature.startswith(b"GIF8"):
                    suffix = ".gif"
                else:
                    suffix = ".png"
                base_dir = OUTDIR if OUTDIR else Path(os.getcwd())
                target_dir = base_dir / "images"
                target_dir.mkdir(parents=True, exist_ok=True)
                target = target_dir / f"{save_prefix}_{position}{suffix}"
                target.write_bytes(payload)
                written_files.append(str(target))
            recognized = ocr_image_bytes(payload)
            if recognized:
                ocr_chunks.append(recognized)
                hits += 1
        except Exception:
            continue

    return ("\n\n".join(ocr_chunks).strip(), total, hits, written_files)


def safe_filename(name: str) -> str:
    """Keep only alphanumerics, spaces, underscores and hyphens; strip trailing whitespace."""
    allowed_punctuation = (" ", "_", "-")
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_punctuation]
    return "".join(kept).rstrip()

def out_path(filename: str) -> Path:
    """Map `filename` to its output location.

    With OUTDIR set, only the final path component is kept and placed under
    OUTDIR; otherwise the filename is returned as a Path unchanged.
    """
    candidate = Path(filename)
    if not OUTDIR:
        return candidate
    return OUTDIR / candidate.name


def write_json(path: Path, obj) -> None:
    """Serialise `obj` as indented UTF-8 JSON at `path`, creating parent folders as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    import json
    with path.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(obj, handle, ensure_ascii=False, indent=2)


def write_text(path: Path, text: str) -> None:
    """Write `text` (or an empty string when it is falsy) to `path` as UTF-8, creating parent folders."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text or "", encoding="utf-8")


def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 digest of `data` as a lowercase hex string."""
    import hashlib
    return hashlib.sha256(data).hexdigest()


# --- Vision captioning helpers ---
def ollama_pull_model(model: str) -> None:
    """Pull a model so end-users don't need to run commands manually.

    Runs ``ollama pull <model>`` as a subprocess.

    Args:
        model: Name of the Ollama model to pull.

    Raises:
        RuntimeError: If the ``ollama`` executable cannot be found, or if the
            pull command exits with a non-zero status. The original exception
            is attached as ``__cause__``.
    """
    try:
        print(f"Ollama: prøver at hente modellen '{model}' (ollama pull)...")
        subprocess.run(["ollama", "pull", model], check=True)
    except FileNotFoundError as e:
        # Chain the cause so the traceback shows why the command was not found.
        raise RuntimeError("Kunne ikke finde 'ollama' kommandoen. Installer Ollama og sørg for at 'ollama' ligger i PATH.") from e
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Kunne ikke pull'e modellen '{model}'. Ollama exit code: {e.returncode}") from e


def caption_with_cache(image_bytes: bytes, cache_dir: Path, model: str, host: str) -> str:
    """Return a caption for the image, reusing a cached one keyed by the image's sha256.

    On a cache miss (or an unreadable cache file) the caption is generated via
    ollama_vision_caption and stored under ``<sha256>.caption.txt`` in
    `cache_dir`; failures while writing the cache are ignored.
    """
    digest = sha256_bytes(image_bytes)
    cached_file = cache_dir / f"{digest}.caption.txt"

    if cached_file.exists():
        try:
            return cached_file.read_text(encoding="utf-8").strip()
        except Exception:
            # Unreadable cache entry: fall through and regenerate.
            pass

    fresh_caption = ollama_vision_caption(image_bytes, model=model, host=host)
    try:
        write_text(cached_file, fresh_caption)
    except Exception:
        # Caching is optional; the caption is still returned.
        pass
    return fresh_caption


def ollama_vision_caption(image_bytes: bytes, model: str, host: str) -> str:
    """Ask Ollama's /api/chat endpoint for a short structured caption of an image.

    The caption is stripped and, when CAPTION_MAX_CHARS is set, cut to that
    length with a trailing ellipsis. If the server answers 404 with a
    "model ... not found" body, the model is pulled and the request is sent
    once more. Any other failure raises RuntimeError.
    """

    def caption_from(response) -> str:
        # Pull the message content out of a successful response and enforce the length cap.
        body = response.json()
        caption = (body.get("message") or {}).get("content") or ""
        caption = caption.strip()
        if CAPTION_MAX_CHARS and len(caption) > CAPTION_MAX_CHARS:
            caption = caption[:CAPTION_MAX_CHARS].rstrip() + "…"
        return caption

    endpoint = f"{host.rstrip('/')}/api/chat"
    encoded_image = base64.b64encode(image_bytes).decode("utf-8")

    system_prompt = (
        "Du er en hjælpsom assistent, der beskriver grafer/billeder kort og præcist til brug i noter. "
        "Skriv på dansk. Gæt ikke på tal du ikke kan se."
    )
    user_prompt = (
        "Lav en kort, struktureret beskrivelse af billedet.\n"
        "- Hvad er det for en type figur (graf, tabel, diagram, tekst, foto)?\n"
        "- Hvilke variable/akser/enheder kan du se?\n"
        "- Hvad er hovedtendensen (stiger/falder/sammenligning), hvis det er en graf?\n"
        "Hold det under 8-12 linjer."
    )

    request_body = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt, "images": [encoded_image]},
        ],
        "stream": False,
        "options": {"temperature": 0.1},
    }

    first = requests.post(endpoint, json=request_body, timeout=240)
    if first.ok:
        return caption_from(first)

    model_missing = first.status_code == 404 and "model" in first.text and "not found" in first.text
    if not model_missing:
        raise RuntimeError(f"Ollama vision chat fejlede: {first.status_code} {first.text[:200]}")

    # Auto-fix: the model is missing, so pull it and retry once.
    ollama_pull_model(model)
    retry = requests.post(endpoint, json=request_body, timeout=240)
    if retry.ok:
        return caption_from(retry)
    raise RuntimeError(f"Ollama vision chat fejlede efter pull: {retry.status_code} {retry.text[:200]}")

def find_existing_export_root(
    section_id: str,
    site_id: str | None,
    search_dir: Path,
    section_name_hint: str | None = None,
) -> Path | None:
    """Find newest reusable export_* folder for a section/site.

    Primary match: manifest section_id + site_id.
    Fallback: in-progress folder name containing section_name_hint when manifest is missing.
    """
    def norm_id(v: str | None) -> str:
        s = str(v or "").strip().lower()
        if s.startswith("{") and s.endswith("}"):
            s = s[1:-1]
        if s.startswith("1-"):
            s = s[2:]
        return s

    def norm_site(v: str | None) -> str:
        return str(v or "").strip().lower()

    try:
        if not search_dir.exists() or not search_dir.is_dir():
            return None

        candidates: list[Path] = []
        for p in search_dir.iterdir():
            if not p.is_dir() or not p.name.startswith("export_"):
                continue
            mf = p / "manifest.json"
            if not mf.exists():
                continue
            try:
                import json
                m = json.loads(mf.read_text(encoding="utf-8"))
            except Exception:
                continue

            if norm_id(m.get("section_id")) != norm_id(section_id):
                continue
            if norm_site(m.get("site_id")) != norm_site(site_id):
                continue

            candidates.append(p)

        if not candidates:
            # Fallback for interrupted exports without manifest:
            # pick newest export folder whose name includes section hint and has pages/.
            hint = safe_filename(section_name_hint or "").strip().lower()
            if hint:
                for p in search_dir.iterdir():
                    if not p.is_dir() or not p.name.startswith("export_"):
                        continue
                    if hint not in p.name.lower():
                        continue
                    if not (p / "pages").exists():
                        continue
                    candidates.append(p)

            if not candidates:
                return None

        def mtime_for(d: Path) -> float:
            mf = d / "manifest.json"
            if mf.exists():
                return mf.stat().st_mtime
            return d.stat().st_mtime

        candidates.sort(key=mtime_for, reverse=True)
        return candidates[0]
    except Exception:
        return None

def make_export_root(section_name: str) -> Path:
    """Create a fresh, timestamped export folder for a section and return it.

    The folder is named ``export_<sanitised section name>_<YYYYmmdd_HHMMSS>``,
    lives under OUTDIR (or the current working directory when OUTDIR is unset)
    and is created with pages/, images/, attachments/ and cache/ocr/ inside.
    """
    import datetime

    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # Cap the name part at 60 characters; use a generic label if nothing survives.
    base = safe_filename(section_name)[:60] or "section"

    parent = OUTDIR if OUTDIR else Path(os.getcwd())
    root = parent / f"export_{base}_{ts}"
    root.mkdir(parents=True, exist_ok=True)
    for subfolder in ("pages", "images", "attachments"):
        (root / subfolder).mkdir(exist_ok=True)
    (root / "cache" / "ocr").mkdir(parents=True, exist_ok=True)
    return root


def ocr_with_cache(image_bytes: bytes, lang: str, cache_dir: Path) -> str:
    """OCR an image, reusing a cached result keyed by the image's sha256 when caching is on.

    Returns "" right away when neither Tesseract (pytesseract + PIL) nor
    OpenAI OCR is available. With ENABLE_OCR_CACHE set, results are read from
    and written to ``<sha256>.txt`` in `cache_dir`; cache I/O errors are ignored.
    """
    tesseract_ready = bool(pytesseract and Image)
    if not tesseract_ready and not openai_ocr_enabled():
        return ""

    digest = sha256_bytes(image_bytes)
    cached_file = cache_dir / f"{digest}.txt"

    if ENABLE_OCR_CACHE and cached_file.exists():
        try:
            return cached_file.read_text(encoding="utf-8").strip()
        except Exception:
            # Unreadable cache entry: fall through and redo the OCR.
            pass

    recognized = ocr_image_bytes(image_bytes, lang=lang)
    if ENABLE_OCR_CACHE:
        try:
            write_text(cached_file, recognized)
        except Exception:
            # Caching is optional; the OCR result is still returned.
            pass
    return recognized


def ocr_all_images_in_page_html_to_dir(
    token: str,
    html: str,
    site_id: str | None,
    export_root: Path,
    page_graph_id: str | None,
    page_id_safe: str,
) -> tuple[str, int, int, list[str], list[dict]]:
    """Download, optionally save, OCR and caption every image on a page.

    Image URLs come from the Graph resources listing when `page_graph_id` is
    given and yields at least one URL; otherwise from the page HTML. Work is
    controlled by the SAVE_IMAGES, ENABLE_OCR and ENABLE_IMAGE_CAPTIONS flags.
    An image whose download or processing raises is skipped entirely.

    Args:
        token: Bearer token used for Graph requests.
        html: Page HTML used as the fallback source of image URLs.
        site_id: Site id for site-scoped Graph URLs, or None.
        export_root: Export folder; images go to ``images/`` and caches to
            ``cache/ocr`` and ``cache/captions`` below it.
        page_graph_id: Graph page id for the resources listing, or None.
        page_id_safe: Filename-safe prefix for saved image files.

    Returns:
        (combined OCR text, number of image URLs found, number of images with
        OCR text, saved image paths, per-image dicts with ``image_path``,
        ``ocr_text`` and ``caption``).
    """
    texts: list[str] = []
    saved_paths: list[str] = []
    items: list[dict] = []
    with_text = 0

    urls: list[str] = []
    if page_graph_id:
        try:
            res = list_page_resources(token, page_graph_id, site_id, top=500)
            for item in res:
                ctype = (item.get("contentType") or "").lower()
                # Resources without a content type are kept; typed ones must be images.
                if ctype and not ctype.startswith("image/"):
                    continue
                rid_url = graph_resource_content_url(item, site_id)
                if rid_url:
                    urls.append(rid_url)
        except Exception:
            # Best effort: fall back to the HTML below.
            pass

    if not urls:
        urls = extract_image_urls_from_html(html)

    found = len(urls)
    cache_dir = export_root / "cache" / "ocr"
    caption_cache_dir = export_root / "cache" / "captions"
    caption_cache_dir.mkdir(parents=True, exist_ok=True)
    img_dir = export_root / "images"
    if SAVE_IMAGES:
        # export_root is not guaranteed to contain images/ already. Without
        # this, a missing folder makes every open() below fail and the blanket
        # except would silently drop the image together with its OCR/caption.
        img_dir.mkdir(parents=True, exist_ok=True)

    for idx, u in enumerate(urls, start=1):
        url = normalize_graph_url(u, site_id)
        try:
            blob = fetch_resource_bytes(token, url)

            # Guess the file type from the leading bytes; PNG is the default.
            ext = ".png"
            head = blob[:12]
            if head.startswith(b"\xff\xd8"):
                ext = ".jpg"
            elif head.startswith(b"GIF8"):
                ext = ".gif"

            out_img = img_dir / f"{page_id_safe}_{idx}{ext}"

            rel_img = None
            if SAVE_IMAGES:
                with open(out_img, "wb") as f:
                    f.write(blob)
                saved_paths.append(str(out_img))
                try:
                    rel_img = str(out_img.relative_to(export_root))
                except Exception:
                    rel_img = str(out_img)

            txt = ""
            if ENABLE_OCR:
                txt = ocr_with_cache(blob, lang=OCR_LANG, cache_dir=cache_dir)

            cap = ""
            if ENABLE_IMAGE_CAPTIONS:
                try:
                    cap = caption_with_cache(blob, cache_dir=caption_cache_dir, model=VISION_MODEL, host=OLLAMA_HOST)
                except Exception:
                    # Don't crash export if captioning fails
                    cap = ""

            if txt:
                texts.append(txt)
                with_text += 1

            items.append({
                "image_path": rel_img or str(out_img),
                "ocr_text": txt or "",
                "caption": cap or "",
            })

        except Exception:
            continue

    return ("\n\n".join(texts).strip(), found, with_text, saved_paths, items)


def main():
    # Allow overriding TARGET_LINK from CLI arg (used by Streamlit Main.py)
    global TARGET_LINK, SITE_HINT
    if len(sys.argv) > 1 and sys.argv[1].strip():
        TARGET_LINK = sys.argv[1].strip()

    # Brugere kan paste både Doc.aspx-link og onenote:-link.
    # Foretræk altid onenote:-linket, hvis det findes.
    raw = (TARGET_LINK or "").strip()
    if "onenote:" in raw.lower():
        i = raw.lower().find("onenote:")
        raw = raw[i:].strip()
    else:
        # ellers: tag første token der ligner en URL
        for tok in raw.split():
            if tok.startswith("https://") or tok.startswith("http://") or tok.lower().startswith("onenote:"):
                raw = tok.strip()
                break

    TARGET_LINK = raw

    if AUTH_ONLY:
        token_result = acquire_token_device_code()
        claims = token_result.get("id_token_claims") if isinstance(token_result, dict) else {}
        user = ""
        if isinstance(claims, dict):
            user = str(claims.get("preferred_username") or claims.get("upn") or "").strip()
        print("AUTH_ONLY=true")
        print("AUTH_OK=true")
        if user:
            print(f"AUTH_USER={user}")
        return

    if not (TARGET_LINK or "").strip():
        raise RuntimeError(
            "TARGET_LINK mangler. Indsæt et OneNote link i Streamlit, "
            "eller angiv det som første argument til scriptet."
        )

    # VIGTIGT: SITE_HINT blev udtrukket ved import-tid, men TARGET_LINK ændres først her.
    # Derfor skal vi udtrække SITE_HINT igen nu.
    if SITE_HINT is None and TARGET_LINK:
        m = re.search(r"https://([^/]+/sites/[^/]+)", TARGET_LINK)
        if m:
            SITE_HINT = m.group(1).lower()

    token_result = acquire_token_device_code()
    access_token = token_result["access_token"]
    print(f"TARGET_LINK: {TARGET_LINK}")
    print(f"SITE_HINT: {SITE_HINT}")
    print(
        "EXPORT_WHOLE_SECTION: "
        f"{EXPORT_WHOLE_SECTION} | ENABLE_OCR: {ENABLE_OCR} | SAVE_IMAGES: {SAVE_IMAGES} | "
        f"ENABLE_ATTACHMENTS: {ENABLE_ATTACHMENTS} | SAVE_ATTACHMENTS: {SAVE_ATTACHMENTS}"
    )

    if ENABLE_OCR:
        if OCR_PROVIDER == "openai":
            if openai_ocr_enabled():
                print(f"OCR aktiveret via OpenAI ({OPENAI_OCR_MODEL}).")
            else:
                print("Advarsel: OCR_PROVIDER=openai, men OPENAI_API_KEY mangler. Ingen OCR udføres.")
        elif pytesseract and Image:
            print(f"OCR aktiveret via Tesseract (sprog: {OCR_LANG}).")
            if openai_ocr_enabled():
                print(f"OpenAI OCR fallback er aktiv ({OPENAI_OCR_MODEL}) hvis Tesseract ikke finder tekst.")
        elif openai_ocr_enabled():
            print(f"OCR aktiveret via OpenAI fallback ({OPENAI_OCR_MODEL}), da Tesseract/PIL mangler.")
        else:
            print("Advarsel: ENABLE_OCR=True, men hverken Tesseract/PIL eller OpenAI OCR er tilgaengelig.")
    if ENABLE_OCR and SAVE_IMAGES:
        print("Billeder gemmes i ./images/")
    if ENABLE_ATTACHMENTS and not (PdfReaderNew or PdfReaderOld):
        print("Bemærk: ingen PDF-parser installeret (pypdf/PyPDF2). PDF-vedhæftninger gemmes, men tekst kan mangle.")

    site_id = None
    if SITE_HINT:
        try:
            site_id = resolve_site_id(access_token, SITE_HINT)
            if site_id:
                print(f"Bruger SharePoint site: {SITE_HINT}  (id={site_id})")
        except Exception as ex:
            print(f"Kunne ikke resolve SITE_HINT '{SITE_HINT}': {ex}")

    pages = []

    # --- Direkte side-hentning hvis linket indeholder page-id ---
    raw_page_id, page_guid_hint = extract_page_id(TARGET_LINK or "")
    if raw_page_id:
        page_candidates = []
        # if the raw value already looks prefixed, try it first
        if raw_page_id.startswith("1-"):
            page_candidates.append(raw_page_id)
        # otherwise try OneNote's usual prefixed form
        if page_guid_hint and not raw_page_id.startswith("1-"):
            page_candidates.append(f"1-{page_guid_hint}")

        # As a last resort also try the bare guid (some tenants accept it)
        if not page_candidates:
            page_candidates = [raw_page_id]

        fetched = False
        last_error_text = None
        for pid_try in page_candidates:
            try:
                print(f"\n--- Direkte side-hentning (page-id={pid_try}) ---")
                html = fetch_page_content_html(
                    access_token,
                    page_id=pid_try,
                    site_id=site_id,
                    content_url=None
                )
                text, concepts = simplify_html_to_text(html)

                # Afled filnavns-base fra <title>
                base = safe_filename("Direkte side")
                try:
                    title_guess = BeautifulSoup(html, "lxml").title
                    if title_guess and title_guess.string:
                        base = safe_filename(title_guess.string)[:60] or base
                except Exception:
                    pass

                # OCR (brug resources hvis muligt – vi kender ikke Graph page-id her, så lad være med at sende det)
                ocr_text, found_imgs, with_text, saved_paths = ("", 0, 0, [])
                if ENABLE_OCR:
                    ocr_text, found_imgs, with_text, saved_paths = ocr_all_images_in_page_html(
                        access_token, html, site_id, save_prefix=base, page_graph_id=None
                    )
                    text_merged = (text + "\n\n[OCR]\n" + ocr_text).strip() if ocr_text else text
                else:
                    text_merged = text

                attachment_block, found_atts, with_atts, saved_att_paths, attachment_items = (
                    "[ATTACHMENTS]\nFound attachments: 0 | Attachments with extracted text: 0",
                    0,
                    0,
                    [],
                    [],
                )
                if ENABLE_ATTACHMENTS:
                    export_root_local = OUTDIR if OUTDIR else Path(os.getcwd())
                    export_root_local.mkdir(parents=True, exist_ok=True)
                    attachment_block, found_atts, with_atts, saved_att_paths, attachment_items = (
                        extract_attachments_from_page_html_to_dir(
                            access_token,
                            html,
                            site_id,
                            export_root=export_root_local,
                            page_id_safe=base,
                        )
                    )
                    text_merged = (text_merged + "\n\n" + attachment_block).strip()

                with open(out_path(f"{base}.html"), "w", encoding="utf-8") as f:
                    f.write(html)
                with open(out_path(f"{base}.txt"), "w", encoding="utf-8") as f:
                    f.write(text_merged)
                if ENABLE_OCR and ocr_text:
                    with open(out_path(f"{base}.ocr.txt"), "w", encoding="utf-8") as f:
                        f.write(ocr_text)
                with open(out_path(f"{base}.concepts.txt"), "w", encoding="utf-8") as f:
                    f.write("\n".join(concepts))
                if ENABLE_ATTACHMENTS and attachment_items:
                    write_json(out_path(f"{base}.attachments.json"), {"items": attachment_items})
                if ENABLE_OCR:
                    if ocr_text:
                        print(f"OCR: fandt {found_imgs} billeder, {with_text} gav tekst.")
                    else:
                        print(f"OCR: fandt {found_imgs} billeder, ingen gav tekst.")
                    print(f"Gemte: {base}.html, {base}.txt, {base}.concepts.txt" + (", {base}.ocr.txt" if ocr_text else ""))
                    if ENABLE_OCR and SAVE_IMAGES and saved_paths:
                        print("Gemte billeder:", ", ".join(saved_paths))
                else:
                    print(f"Gemte: {base}.html, {base}.txt, {base}.concepts.txt")
                if ENABLE_ATTACHMENTS:
                    print(f"Attachments: fandt {found_atts}, med tekst {with_atts}.")
                    if SAVE_ATTACHMENTS and saved_att_paths:
                        print("Gemte vedhæftninger:", ", ".join(saved_att_paths))
                fetched = True
                break
            except Exception as ex:
                last_error_text = str(ex)
                print(f"Kunne ikke hente page-id {pid_try}: {ex}")

        if fetched:
            return
        else:
            # Option 2: automatisk, målrettet sektion-scan når direkte page-id fejler
            if last_error_text and "Invalid Entity ID" in last_error_text:
                print("Direkte page-id er ikke et gyldigt Graph-ID (Invalid Entity ID). Scanner sektionen for at slå Graph-ID op …")
            else:
                print("Direkte page-id fejlede. Scanner sektionen for at slå Graph-ID op …")
            # Forsøg målrettet opslag: brug section-id fra linket og scan alle sider i sektionen til vi finder page-guid'en
            raw_sec_id, sec_guid_hint = extract_section_id(TARGET_LINK)
            resolved_sec_id = None
            if raw_sec_id:
                candidates = [raw_sec_id]
                if sec_guid_hint and not raw_sec_id.startswith("1-"):
                    candidates.insert(0, f"1-{sec_guid_hint}")
                for cid in candidates:
                    ok = False
                    if site_id:
                        ok, _ = try_get_site_section(access_token, site_id, cid)
                    if not ok:
                        ok, _ = try_get_section(access_token, cid)
                    if ok:
                        resolved_sec_id = cid
                        break

            # Hvis vi stadig ikke har en sektion: prøv at matche på sektionens navn fra linket
            if not resolved_sec_id and site_id:
                sec_name_hint = extract_section_name_hint(TARGET_LINK)
                if sec_name_hint:
                    print(f"Kunne ikke resolve sektion via ID. Forsøger navnsmatch: '{sec_name_hint}' …")
                    try:
                        secs = list_site_sections(access_token, site_id, top=1000)
                        for s in secs:
                            name = (s.get('displayName') or '').strip()
                            if name.lower() == sec_name_hint.lower():
                                resolved_sec_id = s.get('id')
                                print(f"Matchede sektion ved navn: '{name}' (id={resolved_sec_id})")
                                break
                    except Exception as _ex:
                        pass

            title_hint = extract_title_hint(TARGET_LINK)

            if resolved_sec_id:
                print(f"Forsøger sektion-scan for page-guid {page_guid_hint or '(ingen GUID)'} ...")
                if site_id:
                    all_pages = list_pages_in_site_section_all(access_token, site_id, resolved_sec_id, page_size=100)
                else:
                    all_pages = list_pages_in_section_all(access_token, resolved_sec_id, page_size=100)

                target = None
                for p in all_pages:
                    web = (((p.get('links') or {}).get('oneNoteWebUrl') or {}).get('href') or '')
                    title = (p.get('title') or '')
                    guid_match = (page_guid_hint and page_guid_hint.lower() in web.lower())
                    title_match = (title_hint and title_hint.lower() == title.lower())
                    if guid_match or title_match:
                        target = p
                        break

                if target:
                    print("Fandt siden via sektion-scan. Henter direkte fra contentUrl …")
                    html = fetch_page_content_html(
                        access_token,
                        page_id=target.get('id'),
                        site_id=site_id,
                        content_url=target.get('contentUrl')
                    )
                    text, concepts = simplify_html_to_text(html)

                    # Bestem basisfilnavn fra sidens titel eller title_hint
                    base = safe_filename(target.get('title') or title_hint or 'Direkte side')[:60]

                    # OCR via resources-endpoint
                    ocr_text, found_imgs, with_text, saved_paths = ("", 0, 0, [])
                    if ENABLE_OCR:
                        ocr_text, found_imgs, with_text, saved_paths = ocr_all_images_in_page_html(
                            access_token, html, site_id, save_prefix=base, page_graph_id=target.get('id')
                        )

                    # Flet OCR-tekst ind i hoved-tekst hvis der var noget
                    text_merged = (text + "\n\n[OCR]\n" + ocr_text).strip() if ocr_text else text

                    attachment_block, found_atts, with_atts, saved_att_paths, attachment_items = (
                        "[ATTACHMENTS]\nFound attachments: 0 | Attachments with extracted text: 0",
                        0,
                        0,
                        [],
                        [],
                    )
                    if ENABLE_ATTACHMENTS:
                        export_root_local = OUTDIR if OUTDIR else Path(os.getcwd())
                        export_root_local.mkdir(parents=True, exist_ok=True)
                        attachment_block, found_atts, with_atts, saved_att_paths, attachment_items = (
                            extract_attachments_from_page_html_to_dir(
                                access_token,
                                html,
                                site_id,
                                export_root=export_root_local,
                                page_id_safe=base,
                            )
                        )
                        text_merged = (text_merged + "\n\n" + attachment_block).strip()

                    with open(out_path(f"{base}.html"), "w", encoding="utf-8") as f:
                        f.write(html)
                    with open(out_path(f"{base}.txt"), "w", encoding="utf-8") as f:
                        f.write(text_merged)
                    if ENABLE_OCR and ocr_text:
                        with open(out_path(f"{base}.ocr.txt"), "w", encoding="utf-8") as f:
                            f.write(ocr_text)
                    with open(out_path(f"{base}.concepts.txt"), "w", encoding="utf-8") as f:
                        f.write("\n".join(concepts))
                    if ENABLE_ATTACHMENTS and attachment_items:
                        write_json(out_path(f"{base}.attachments.json"), {"items": attachment_items})

                    if ENABLE_OCR:
                        if ocr_text:
                            print(f"OCR: fandt {found_imgs} billeder, {with_text} gav tekst.")
                        else:
                            print(f"OCR: fandt {found_imgs} billeder, ingen gav tekst.")
                        print(f"Gemte: {base}.html, {base}.txt, {base}.concepts.txt" + (", {base}.ocr.txt" if ocr_text else ""))
                        if SAVE_IMAGES and saved_paths:
                            print("Gemte billeder:", ", ".join(saved_paths))
                    else:
                        print(f"Gemte: {base}.html, {base}.txt, {base}.concepts.txt")
                    if ENABLE_ATTACHMENTS:
                        print(f"Attachments: fandt {found_atts}, med tekst {with_atts}.")
                        if SAVE_ATTACHMENTS and saved_att_paths:
                            print("Gemte vedhæftninger:", ", ".join(saved_att_paths))
                    return
                else:
                    print("Kunne ikke finde siden via sektion-scan.")

            if EXACT_PAGE_ONLY:
                print("EXACT_PAGE_ONLY=True: Henter ikke hele sektionen som fallback. Afslutter.")
                return
            else:
                print("Direkte page-id kunne ikke hentes; forsøger sektion-faldback.")

    # 1) Hvis TARGET_LINK er sat (eller et helt link), brug den direkte
    if TARGET_LINK:
        raw_id, guid_hint = extract_section_id(TARGET_LINK)
        candidates = []
        # 1) hvis raw ser ud som et OneNote-id (starter med 1-), prøv det først
        if raw_id:
            candidates.append(raw_id)
        # 2) hvis vi har en GUID, prøv også "1-<GUID>"
        if guid_hint and not raw_id.startswith("1-"):
            candidates.append(f"1-{guid_hint}")

        resolved_id = None
        for cid in candidates:
            ok = False
            # 1) Try via site if available (covers read-only notebooks in class sites)
            if site_id:
                ok, _ = try_get_site_section(access_token, site_id, cid)
            # 2) Fallback to /me if not found
            if not ok:
                ok, _ = try_get_section(access_token, cid)
            if ok:
                resolved_id = cid
                break

        if not resolved_id:
            if site_id:
                # Prefer site-scoped enumeration to include read-only Content Libraries
                sec_name_hint = extract_section_name_hint(TARGET_LINK) or ""
                secs = list_site_sections(access_token, site_id, top=1000)
                print(f"Site-sektioner fundet: {len(secs)} | name-hint fra link: '{sec_name_hint}'")
                for s in secs:
                    web = (s.get('links') or {}).get('oneNoteWebUrl', {}).get('href') or ''
                    name = (s.get('displayName') or '')
                    name_l = name.lower()
                    if (
                        (guid_hint and guid_hint.lower() in web.lower())
                        or (TARGET_SECTION_NAME and TARGET_SECTION_NAME.lower() in name_l)
                        or (sec_name_hint and sec_name_hint.lower() in name_l)
                    ):
                        resolved_id = s.get('id')
                        print(f"Matchede sektion i site: '{name}' (id={resolved_id})")
                        break
            if not resolved_id:
                # Fallback: enumerate user notebooks then their sections (original logic)
                books = list_notebooks(access_token, top=200)
                candidate_books = []
                for nb in books:
                    web = (((nb.get('links') or {}).get('oneNoteWebUrl') or {}).get('href') or '').lower()
                    name = (nb.get('displayName') or '')
                    if SITE_HINT and SITE_HINT in web:
                        candidate_books.append(nb)
                    elif TARGET_NOTEBOOK_NAME and TARGET_NOTEBOOK_NAME.lower() in name.lower():
                        candidate_books.append(nb)
                if not candidate_books:
                    candidate_books = books
                for nb in candidate_books:
                    nb_id = nb.get('id')
                    nb_name = nb.get('displayName')
                    print(f"Scanner sektioner i notesbog: {nb_name}")
                    secs = list_sections_in_notebook(access_token, nb_id, top=300)
                    for s in secs:
                        web = (s.get('links') or {}).get('oneNoteWebUrl', {}).get('href') or ''
                        name = (s.get('displayName') or '')
                        sec_name_hint = extract_section_name_hint(TARGET_LINK) or ""
                        if (
                            (guid_hint and guid_hint.lower() in web.lower())
                            or (TARGET_SECTION_NAME and TARGET_SECTION_NAME.lower() in name.lower())
                            or (sec_name_hint and sec_name_hint.lower() in name.lower())
                        ):
                            resolved_id = s.get('id')
                            break
                    if resolved_id:
                        break
            if not resolved_id:
                # Last fallback: global sections
                sections = list_sections(access_token, top=500)
                for s in sections:
                    web = (s.get('links') or {}).get('oneNoteWebUrl', {}).get('href') or ''
                    name = (s.get('displayName') or '')
                    sec_name_hint = extract_section_name_hint(TARGET_LINK) or ""
                    if (
                        (guid_hint and guid_hint.lower() in web.lower())
                        or (TARGET_SECTION_NAME and TARGET_SECTION_NAME.lower() in name.lower())
                        or (sec_name_hint and sec_name_hint.lower() in name.lower())
                    ):
                        resolved_id = s.get('id')
                        break

        if not resolved_id:
            print("Fandt ingen sektion via ID eller fallback-match. Tjek linket eller dine rettigheder.")
            return

        print(f"\n--- Bruger sektion-id: {resolved_id} ---")

        # Hvis vi vil eksportere hele sektionen: brug pagination og skriv en samlet eksportmappe
        if EXPORT_WHOLE_SECTION:
            section_display = TARGET_SECTION_NAME
            try:
                if site_id:
                    ok, sec_obj = try_get_site_section(access_token, site_id, resolved_id)
                else:
                    ok, sec_obj = try_get_section(access_token, resolved_id)
                if ok:
                    section_display = (sec_obj.get("displayName") or section_display) or section_display
            except Exception:
                pass

            # Incremental export: reuse previous export root for this section/site when possible.
            cwd = OUTDIR if OUTDIR else Path(os.getcwd())
            cwd.mkdir(parents=True, exist_ok=True)
            existing_root = find_existing_export_root(
                resolved_id,
                site_id,
                cwd,
                section_name_hint=section_display,
            )

            if existing_root and existing_root.exists():
                export_root = existing_root
                # Ensure expected subfolders exist (for older exports)
                (export_root / "pages").mkdir(exist_ok=True)
                (export_root / "images").mkdir(exist_ok=True)
                (export_root / "attachments").mkdir(exist_ok=True)
                (export_root / "cache" / "ocr").mkdir(parents=True, exist_ok=True)
                (export_root / "cache" / "captions").mkdir(parents=True, exist_ok=True)
                print(f"Opdaterer eksisterende eksport (incremental): {export_root}")
            else:
                export_root = make_export_root(section_display)
                print(f"Eksporterer hele sektionen til: {export_root}")

            print(f"EXPORT_ROOT={export_root}")

            # Write early manifest so interrupted runs can be resumed on next fetch.
            manifest_bootstrap = {
                "section_name": section_display,
                "section_id": resolved_id,
                "site_id": site_id,
                "exported_at": __import__("datetime").datetime.now().isoformat(),
                "page_count": 0,
                "scopes": SCOPES,
                "incremental": True,
                "updated_pages": None,
                "status": "in_progress",
            }
            write_json(export_root / "manifest.json", manifest_bootstrap)

            if site_id:
                pages_all = list_pages_in_site_section_all(access_token, site_id, resolved_id, page_size=100)
            else:
                pages_all = list_pages_in_section_all(access_token, resolved_id, page_size=100)

            # Load existing meta so we can skip unchanged pages.
            existing_meta_by_id: dict[str, dict] = {}
            index_path = export_root / "index.jsonl"
            if index_path.exists():
                try:
                    import json
                    with open(index_path, "r", encoding="utf-8") as f:
                        for line in f:
                            line = line.strip()
                            if not line:
                                continue
                            obj = json.loads(line)
                            pid0 = obj.get("id")
                            if pid0:
                                existing_meta_by_id[str(pid0)] = obj
                except Exception:
                    existing_meta_by_id = {}

            def needs_fetch(page_obj: dict) -> bool:
                """Tell whether a listed page has to be downloaded (again).

                Args:
                    page_obj: Page dict from the section listing; only its
                        ``id`` and ``lastModifiedDateTime`` keys are read.

                Returns:
                    ``True`` when the page has no id, has no entry in
                    ``existing_meta_by_id``, lacks a ``lastModifiedDateTime``
                    on either side, or the two timestamps differ as strings.
                    ``False`` only when both timestamps exist and are equal.
                """
                page_key = str(page_obj.get("id") or "")
                # Without an id there is nothing to look up, so treat it as new.
                previous = existing_meta_by_id.get(page_key) if page_key else None
                if not previous:
                    return True
                stamp_before = str(previous.get("lastModifiedDateTime") or "")
                stamp_now = str(page_obj.get("lastModifiedDateTime") or "")
                # A missing timestamp means "unchanged" cannot be proven -> fetch.
                return not (stamp_before and stamp_now and stamp_before == stamp_now)

            pages_to_fetch = [p for p in pages_all if needs_fetch(p)]
            print(f"Incremental: {len(pages_to_fetch)}/{len(pages_all)} sider er nye/ændrede.")

            if not pages_to_fetch:
                manifest = {
                    "section_name": section_display,
                    "section_id": resolved_id,
                    "site_id": site_id,
                    "exported_at": __import__("datetime").datetime.now().isoformat(),
                    "page_count": len(pages_all),
                    "scopes": SCOPES,
                    "incremental": True,
                    "updated_pages": 0,
                    "status": "completed",
                }
                write_json(export_root / "manifest.json", manifest)
                print("Ingen nye ændringer siden sidste eksport — springer download over.")
                return

            updated_meta_by_id = dict(existing_meta_by_id)

            index_path = export_root / "index.jsonl"
            with open(index_path, "w", encoding="utf-8") as index_f:
                for i, p in enumerate(pages_to_fetch, start=1):
                    pid = p.get("id")
                    title = p.get("title") or pid or "(untitled)"
                    safe_pid = safe_filename(pid or f"page_{i}")[:80]
                    page_dir = export_root / "pages" / safe_pid
                    page_dir.mkdir(parents=True, exist_ok=True)

                    print(f"[{i}/{len(pages_all)}] Henter indhold: {title}")

                    html = fetch_page_content_html(
                        access_token,
                        page_id=pid,
                        site_id=site_id,
                        content_url=p.get("contentUrl"),
                    )

                    text, concepts = simplify_html_to_text(html)

                    ocr_text, found_imgs, with_text, saved_paths, image_items = ("", 0, 0, [], [])
                    attachment_block = "[ATTACHMENTS]\nFound attachments: 0 | Attachments with extracted text: 0"
                    found_atts, with_text_atts = 0, 0
                    saved_att_paths: list[str] = []
                    attachment_items: list[dict] = []

                    # Kør billed-download/OCR hvis enten SAVE_IMAGES eller ENABLE_OCR er slået til
                    if SAVE_IMAGES or ENABLE_OCR:
                        ocr_text, found_imgs, with_text, saved_paths, image_items = ocr_all_images_in_page_html_to_dir(
                            access_token,
                            html,
                            site_id,
                            export_root=export_root,
                            page_graph_id=pid,
                            page_id_safe=safe_pid,
                        )

                    if ENABLE_ATTACHMENTS:
                        attachment_block, found_atts, with_text_atts, saved_att_paths, attachment_items = (
                            extract_attachments_from_page_html_to_dir(
                                access_token,
                                html,
                                site_id,
                                export_root=export_root,
                                page_id_safe=safe_pid,
                            )
                        )

                    # Byg en [IMAGES]-blok så billed-tunge sider bliver søgbare, selv hvis OCR er tom
                    images_block_lines = []
                    images_block_lines.append("[IMAGES]")
                    images_block_lines.append(f"Found images: {found_imgs} | Images with OCR text: {with_text}")
                    if image_items:
                        for it in image_items:
                            ip = (it.get("image_path") or "").strip()
                            t = (it.get("ocr_text") or "").strip()
                            if not ip:
                                continue
                            images_block_lines.append(f"- {ip}")
                            if t:
                                images_block_lines.append("  OCR:")
                                for line in t.splitlines():
                                    images_block_lines.append("    " + line)
                            cap = (it.get("caption") or "").strip()
                            if cap:
                                images_block_lines.append("  CAPTION:")
                                for line in cap.splitlines():
                                    images_block_lines.append("    " + line)
                    images_block = "\n".join(images_block_lines).strip()

                    merged = text

                    # Behold gammel [OCR]-sektion for kompatibilitet
                    if ocr_text:
                        merged = (merged + "\n\n[OCR]\n" + ocr_text).strip()

                    # VIGTIGT: altid append [IMAGES]
                    merged = (merged + "\n\n" + images_block).strip()
                    if ENABLE_ATTACHMENTS:
                        merged = (merged + "\n\n" + attachment_block).strip()

                    # Gem per-image OCR maskinlæsbart (valgfrit men praktisk)
                    if image_items:
                        write_json(page_dir / "images_ocr.json", {"items": image_items})
                    if attachment_items:
                        write_json(page_dir / "attachments_extract.json", {"items": attachment_items})

                    write_text(page_dir / "page.html", html)
                    write_text(page_dir / "page.txt", merged)
                    if ocr_text:
                        write_text(page_dir / "page.ocr.txt", ocr_text)
                    write_text(page_dir / "concepts.txt", "\n".join(concepts))

                    meta = {
                        "id": pid,
                        "title": title,
                        "createdDateTime": p.get("createdDateTime"),
                        "lastModifiedDateTime": p.get("lastModifiedDateTime"),
                        "links": p.get("links"),
                        "contentUrl": p.get("contentUrl"),
                        "found_images": found_imgs,
                        "images_with_text": with_text,
                        "saved_images": saved_paths if SAVE_IMAGES else [],
                        "found_attachments": found_atts,
                        "attachments_with_text": with_text_atts,
                        "saved_attachments": saved_att_paths if SAVE_ATTACHMENTS else [],
                        "concepts": concepts,
                        "text_path": str((page_dir / "page.txt").relative_to(export_root)),
                        "ocr_path": str((page_dir / "page.ocr.txt").relative_to(export_root)) if ocr_text else None,
                        "images_ocr_path": str((page_dir / "images_ocr.json").relative_to(export_root)) if image_items else None,
                        "attachments_extract_path": str((page_dir / "attachments_extract.json").relative_to(export_root)) if attachment_items else None,
                        "captions_enabled": ENABLE_IMAGE_CAPTIONS,
                        "vision_model": VISION_MODEL if ENABLE_IMAGE_CAPTIONS else None,
                    }
                    write_json(page_dir / "meta.json", meta)
                    updated_meta_by_id[str(pid)] = meta
                # Write complete index.jsonl (unchanged + updated) in the order of pages_all
                import json
                for p_all in pages_all:
                    pid0 = str(p_all.get("id") or "")
                    if not pid0:
                        continue
                    meta_obj = updated_meta_by_id.get(pid0)
                    if meta_obj:
                        index_f.write(json.dumps(meta_obj, ensure_ascii=False) + "\n")
            manifest = {
                "section_name": section_display,
                "section_id": resolved_id,
                "site_id": site_id,
                "exported_at": __import__("datetime").datetime.now().isoformat(),
                "page_count": len(pages_all),
                "scopes": SCOPES,
                "incremental": True,
                "updated_pages": len(pages_to_fetch),
                "status": "completed",
            }

            write_json(export_root / "manifest.json", manifest)
            print("Færdig: manifest.json + index.jsonl + pages/*")
            return

        # Ellers: gammel opførsel (hent MAX_PAGES)
        pages = (
            list_pages_in_site_section(access_token, site_id, resolved_id, top=MAX_PAGES)
            if site_id
            else list_pages_in_section(access_token, resolved_id, top=MAX_PAGES)
        )
        if not pages:
            print("Ingen sider fundet i den valgte sektion (efter resolve).")
    else:
        # 2) Ellers scan sektioner og match på navn (delmatch, case-insensitive)
        print(f"\n--- Finder sektion(er) der matcher: {TARGET_SECTION_NAME} ---")
        if site_id:
            sections = list_site_sections(access_token, site_id, top=1000)
        else:
            sections = list_sections(access_token, top=500)

        def match_section(s):
            """Return True when section dict ``s`` passes the name filters.

            The section's ``displayName`` must contain TARGET_SECTION_NAME
            (case-insensitive substring). When TARGET_NOTEBOOK_NAME is set,
            the ``displayName`` of the section's ``parentNotebook`` must also
            contain it (case-insensitive substring); a missing parent or name
            is treated as an empty string.
            """
            wanted_section = TARGET_SECTION_NAME.lower()
            section_title = (s.get("displayName") or "").lower()
            if wanted_section not in section_title:
                return False
            # No notebook filter configured: the section name alone decides.
            if not TARGET_NOTEBOOK_NAME:
                return True
            parent = s.get("parentNotebook") or {}
            notebook_title = parent.get("displayName") or ""
            return TARGET_NOTEBOOK_NAME.lower() in notebook_title.lower()

        wanted_sections = [s for s in sections if match_section(s)]

        if not wanted_sections:
            print("Fandt ingen sektioner der matcher. Tjek TARGET_SECTION_NAME / TARGET_NOTEBOOK_NAME eller sæt TARGET_SECTION_ID.")
            return

        pages = []
        for s in wanted_sections:
            sid = s["id"]
            nb = (s.get("parentNotebook") or {}).get("displayName")
            print(f"Henter sider fra sektion: '{s.get('displayName')}' i notesbog: '{nb}' (id={sid})")
            s_pages = (list_pages_in_site_section(access_token, site_id, sid, top=MAX_PAGES) if site_id else list_pages_in_section(access_token, sid, top=MAX_PAGES))
            pages.extend(s_pages)
            if len(pages) >= MAX_PAGES:
                pages = pages[:MAX_PAGES]
                break


    # --- Hent og gem sider fra den/de fundne sektion(er) ---
    if not pages:
        print("Ingen sider at hente.")
        return

    for p in pages[:MAX_PAGES]:
        pid = p.get("id")
        title = p.get("title") or pid
        print(f"\n--- Henter indhold: {title} ---")
        html = fetch_page_content_html(
            access_token,
            page_id=pid,
            site_id=site_id,
            content_url=p.get("contentUrl")
        )
        text, concepts = simplify_html_to_text(html)
        ocr_text, found_imgs, with_text, saved_paths = ("", 0, 0, [])
        base = safe_filename(title)[:60] or (pid[:12] if pid else "page")
        if ENABLE_OCR:
            ocr_text, found_imgs, with_text, saved_paths = ocr_all_images_in_page_html(
                access_token, html, site_id, save_prefix=base, page_graph_id=pid
            )
            if ocr_text:
                text_merged = (text + "\n\n[OCR]\n" + ocr_text).strip()
            else:
                text_merged = text
        else:
            text_merged = text

        attachment_block, found_atts, with_atts, saved_att_paths, attachment_items = (
            "[ATTACHMENTS]\nFound attachments: 0 | Attachments with extracted text: 0",
            0,
            0,
            [],
            [],
        )
        if ENABLE_ATTACHMENTS:
            export_root_local = OUTDIR if OUTDIR else Path(os.getcwd())
            export_root_local.mkdir(parents=True, exist_ok=True)
            attachment_block, found_atts, with_atts, saved_att_paths, attachment_items = (
                extract_attachments_from_page_html_to_dir(
                    access_token,
                    html,
                    site_id,
                    export_root=export_root_local,
                    page_id_safe=base,
                )
            )
            text_merged = (text_merged + "\n\n" + attachment_block).strip()

        with open(out_path(f"{base}.html"), "w", encoding="utf-8") as f:
            f.write(html)
        with open(out_path(f"{base}.txt"), "w", encoding="utf-8") as f:
            f.write(text_merged)
        if ENABLE_OCR and ocr_text:
            with open(out_path(f"{base}.ocr.txt"), "w", encoding="utf-8") as f:
                f.write(ocr_text)
        with open(out_path(f"{base}.concepts.txt"), "w", encoding="utf-8") as f:
            f.write("\n".join(concepts))
        if ENABLE_ATTACHMENTS and attachment_items:
            write_json(out_path(f"{base}.attachments.json"), {"items": attachment_items})
        if ENABLE_OCR:
            if ocr_text:
                print(f"OCR: fandt {found_imgs} billeder, {with_text} gav tekst.")
            else:
                print(f"OCR: fandt {found_imgs} billeder, ingen gav tekst.")
            print(f"Gemte: {base}.html, {base}.txt, {base}.concepts.txt" + (", {base}.ocr.txt" if ocr_text else ""))
            if ENABLE_OCR and SAVE_IMAGES and saved_paths:
                print("Gemte billeder:", ", ".join(saved_paths))
        else:
            print(f"Gemte: {base}.html, {base}.txt, {base}.concepts.txt")
        if ENABLE_ATTACHMENTS:
            print(f"Attachments: fandt {found_atts}, med tekst {with_atts}.")
            if SAVE_ATTACHMENTS and saved_att_paths:
                print("Gemte vedhæftninger:", ", ".join(saved_att_paths))


if __name__ == "__main__":
    # Script entry point: run main() and report any uncaught error as a
    # one-line message on stderr with exit status 2.
    try:
        main()
    except Exception as err:
        # Common failures: AADSTS65001 (admin consent required),
        # 403/401 on scopes, public client off.
        print(f"Fejl: {err}", file=sys.stderr)
        sys.exit(2)
