#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import os
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import hnswlib

import llm_provider


def _resolve_openai_key(*, api_key: Optional[str], api_key_env: Optional[str]) -> Optional[str]:
    """Pick the API key to use, preferring an explicit value over the environment.

    Resolution order:
      1. ``api_key`` itself, when it is non-blank.
      2. The environment variable named by ``api_key_env``, when a name is given.
      3. The ``OPENAI_API_KEY`` environment variable, when no name is given.

    Values are whitespace-stripped regardless of where they come from, and a
    blank value is reported as ``None`` so callers never receive an empty key.

    Args:
        api_key: Key supplied directly (e.g. on the command line); may be None.
        api_key_env: Name of the environment variable to consult; may be None.

    Returns:
        The resolved key, or ``None`` when nothing usable is configured.
    """
    direct = (api_key or "").strip()
    if direct:
        return direct

    # A named variable that is unset yields None; there is no fallback to
    # OPENAI_API_KEY in that case (the default name is used only when no
    # name was supplied at all).
    env_name = (api_key_env or "").strip() or "OPENAI_API_KEY"
    from_env = (os.getenv(env_name) or "").strip()
    return from_env or None

def run_chunk_note(export_root: Path, chunks_path: Path) -> None:
    """Regenerate the chunks file for an export directory.

    Runs the sibling ``chunk_note.py`` script with the current Python
    interpreter, passing ``export_root`` as ``--input`` and ``chunks_path``
    as ``--output``. The parent directory of ``chunks_path`` is created first.

    Raises:
        RuntimeError: if ``chunk_note.py`` is not found next to this file.
        subprocess.CalledProcessError: if the script exits non-zero
            (``check=True``).
    """
    script_dir = Path(__file__).parent
    script_path = (script_dir / "chunk_note.py").resolve()
    if not script_path.exists():
        raise RuntimeError(f"Kunne ikke finde chunk_note.py ved siden af build_index.py: {script_path}")

    # The output directory must exist before the child process is started.
    output_dir = chunks_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    argv = [sys.executable, str(script_path)]
    argv += ["--input", str(export_root)]
    argv += ["--output", str(chunks_path)]

    print("Re-chunk:", " ".join(argv))
    subprocess.run(argv, check=True)

def read_chunks_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a JSON Lines file into a list, one parsed object per non-blank line.

    The file is read as UTF-8. Lines that are empty after stripping
    whitespace are skipped; every other line is parsed with ``json.loads``,
    so a malformed line raises ``json.JSONDecodeError``.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]


def make_embed_text(c: Dict[str, Any]) -> str:
    """Compose the string that gets embedded for a chunk.

    The title and heading trail are placed ahead of the body so that a query
    mentioning a page title or section name can still match the chunk.

    Layout (the TITLE and HEADINGS lines appear only when non-empty)::

        TITLE: <title>
        HEADINGS: <h1> > <h2> > ...
        TEXT:
        <body>
    """
    title = (c.get("title") or "").strip()
    body = (c.get("text") or "").strip()
    # Keep only string headings that contain something besides whitespace;
    # the surviving entries are joined as-is (they are not stripped).
    crumbs = [
        heading
        for heading in (c.get("heading_path") or [])
        if isinstance(heading, str) and heading.strip()
    ]

    lines = []
    if title:
        lines.append("TITLE: " + title)
    if crumbs:
        lines.append("HEADINGS: " + " > ".join(crumbs))
    lines.extend(("TEXT:", body))
    return "\n".join(lines)


def _embed_texts(
    texts: List[str],
    *,
    provider: str,
    embed_model: str,
    host: str,
    api_base: str,
    api_key: Optional[str],
    batch: int,
) -> np.ndarray:
    """Embed ``texts`` in slices of ``batch`` and stack the results as float32."""
    pieces: List[Any] = []
    total = len(texts)
    for start in range(0, total, batch):
        vecs = llm_provider.embed_batch(
            texts[start : start + batch],
            provider=provider,
            model=embed_model,
            host=host,
            api_base=api_base,
            api_key=api_key,
            timeout=120,
        )
        pieces.append(vecs)
        print(f"Embeddings: {min(start + batch, total)}/{total}")
    return np.vstack(pieces).astype(np.float32)


def _write_chunk_meta(meta_path: Path, chunks: List[Dict[str, Any]]) -> None:
    """Write one JSON metadata record per chunk, in the same order as ``chunks``."""
    with meta_path.open("w", encoding="utf-8") as f:
        for c in chunks:
            record = {
                "chunk_id": c.get("chunk_id"),
                "doc_id": c.get("doc_id"),
                "source_path": c.get("source_path"),
                "title": c.get("title"),
                "source_url": c.get("source_url"),
                "source_client_url": c.get("source_client_url"),
                "heading_path": c.get("heading_path", []),
                "chunk_index": c.get("chunk_index"),
                "token_count": c.get("token_count"),
                "text": c.get("text"),
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


def build_index_from_chunks(
    chunks_path: Path,
    out_dir: Path,
    embed_model: str = "nomic-embed-text",
    provider: str = "ollama",
    host: str = "http://localhost:11434",
    api_base: str = "",
    api_key: Optional[str] = None,
    batch: int = 64,
    ef_construction: int = 200,
    M: int = 16,
) -> tuple[Path, Path, Path]:
    """Build embeddings + HNSW index from chunks.jsonl.

    Three files are written into ``out_dir``:

    * ``chunks_hnsw.bin``: the hnswlib index (cosine space); row ``i`` of the
      embedding matrix is stored under label ``i``.
    * ``chunks_meta.jsonl``: one metadata record per chunk, in the same order,
      so line ``i`` describes label ``i``.
    * ``index_config.json``: provider, model, host, dimension and count.

    Args:
        chunks_path: JSON Lines file with the chunks to index.
        out_dir: Directory for the output files; created if missing.
        embed_model: Model name forwarded to ``llm_provider.embed_batch``.
        provider: Provider name forwarded to ``llm_provider.embed_batch``.
        host: Host URL forwarded to ``llm_provider.embed_batch``.
        api_base: Base URL forwarded to ``llm_provider.embed_batch``.
        api_key: API key forwarded to ``llm_provider.embed_batch``.
        batch: Number of texts sent per embedding call; must be >= 1.
        ef_construction: Passed to ``hnswlib.Index.init_index``.
        M: Passed to ``hnswlib.Index.init_index``.

    Returns (index_path, meta_path, cfg_path).

    Raises:
        ValueError: if ``batch`` is smaller than 1.
        RuntimeError: if the chunks file holds no chunks, or the provider
            returned a different number of vectors than there are chunks.
    """
    # Fail before touching the filesystem or the provider: a zero step makes
    # range() raise and a negative one yields no batches at all.
    if batch < 1:
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    out_dir.mkdir(parents=True, exist_ok=True)

    chunks = read_chunks_jsonl(chunks_path)
    if not chunks:
        raise RuntimeError(f"Ingen chunks fundet i: {chunks_path}")

    texts = [make_embed_text(c) for c in chunks]

    X = _embed_texts(
        texts,
        provider=provider,
        embed_model=embed_model,
        host=host,
        api_base=api_base,
        api_key=api_key,
        batch=batch,
    )
    n, dim = X.shape
    # Index labels and metadata lines are matched purely by position, so a
    # short or long provider response would silently misalign them.
    if n != len(chunks):
        raise RuntimeError(
            f"Embedding count mismatch: got {n} vectors for {len(chunks)} chunks"
        )
    print(f"Embedding matrix: n={n} dim={dim}")

    # Build HNSW index
    index = hnswlib.Index(space="cosine", dim=dim)
    index.init_index(max_elements=n, ef_construction=ef_construction, M=M)
    index.add_items(X, np.arange(n))
    # NOTE(review): the hnswlib README says ef is not stored by save_index,
    # so readers of the saved index presumably need to call set_ef themselves.
    index.set_ef(50)

    # Save index + metadata
    index_path = out_dir / "chunks_hnsw.bin"
    meta_path = out_dir / "chunks_meta.jsonl"
    cfg_path = out_dir / "index_config.json"

    index.save_index(str(index_path))
    _write_chunk_meta(meta_path, chunks)

    cfg = {
        "provider": (provider or "ollama").strip().lower(),
        "embed_model": embed_model,
        "host": host,
        "api_base": api_base,
        "dim": int(dim),
        "count": int(n),
        "space": "cosine",
    }
    cfg_path.write_text(json.dumps(cfg, ensure_ascii=False, indent=2), encoding="utf-8")

    print("Wrote:")
    print(f"- {index_path}")
    print(f"- {meta_path}")
    print(f"- {cfg_path}")

    return index_path, meta_path, cfg_path


def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: resolve paths, optionally re-chunk, then build the index.

    Two modes are supported:

    * ``--export_root DIR``: chunks are read from ``DIR/index/chunks.jsonl``
      and, unless ``--no_rechunk`` is given, regenerated first. Output goes to
      ``--out_dir`` or ``DIR/index``. Note that ``--chunks`` is ignored in
      this mode.
    * ``--chunks FILE``: the file is used as-is; output goes to ``--out_dir``
      or the file's directory.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing no-argument callers are
            unaffected.

    Returns:
        0 on success.

    Raises:
        SystemExit: on argument errors (argparse usage error, invalid
            ``--batch``, or neither ``--chunks`` nor ``--export_root`` given).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--chunks", default=None, help="Path til chunks.jsonl")
    ap.add_argument("--out_dir", default=None, help="Output mappe til indexfiler (default: samme mappe som chunks)")
    ap.add_argument("--export_root", default=None, help="Peg på en export_* mappe (bruges til default paths i index/)")
    ap.add_argument("--no_rechunk", action="store_true", help="Spring re-chunk over (ellers rebuildes chunks automatisk)")
    ap.add_argument("--provider", default="ollama", choices=["ollama", "openai"], help="Embedding/chat provider")
    ap.add_argument("--embed_model", default="nomic-embed-text", help="Embedding model")
    ap.add_argument("--host", default="http://localhost:11434", help="Ollama host")
    ap.add_argument("--api_base", default="", help="OpenAI-compatible base URL (optional)")
    ap.add_argument("--api_key", default="", help="API key for provider (optional; otherwise env)")
    ap.add_argument("--api_key_env", default="OPENAI_API_KEY", help="Env var name to read API key from")
    ap.add_argument("--batch", type=int, default=64, help="Batch size til embeddings")
    ap.add_argument("--ef_construction", type=int, default=200)
    ap.add_argument("--M", type=int, default=16)
    args = ap.parse_args(argv)

    # Reject an unusable batch size right away, before any re-chunking or
    # embedding work is started.
    if args.batch < 1:
        ap.error("--batch must be a positive integer")

    if args.export_root:
        export_root = Path(args.export_root).expanduser().resolve()
        chunks_path = (export_root / "index" / "chunks.jsonl").resolve()
        out_dir = Path(args.out_dir).expanduser().resolve() if args.out_dir else (export_root / "index").resolve()

        if not args.no_rechunk:
            run_chunk_note(export_root, chunks_path)
    else:
        if not args.chunks:
            raise SystemExit("Angiv --chunks <path> eller --export_root <export_mappe>.")
        chunks_path = Path(args.chunks).expanduser().resolve()
        out_dir = Path(args.out_dir).expanduser().resolve() if args.out_dir else chunks_path.parent.resolve()

    api_key = _resolve_openai_key(api_key=args.api_key, api_key_env=args.api_key_env)
    build_index_from_chunks(
        chunks_path=chunks_path,
        out_dir=out_dir,
        embed_model=args.embed_model,
        provider=args.provider,
        host=args.host,
        api_base=args.api_base,
        api_key=api_key,
        batch=args.batch,
        ef_construction=args.ef_construction,
        M=args.M,
    )

    return 0


# Run the CLI only when this file is executed as a script; the value
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
