feat: add incremental indexing mode
perso/mcp-maildir/pipeline/head This commit looks good

This commit is contained in:
Julien Cabillot
2026-03-16 16:34:55 -04:00
parent 25c2f7aea0
commit 52fcba17e5
+120 -11
View File
@@ -34,6 +34,8 @@ if not COLLECTION_NAME:
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_NAME", "BAAI/bge-small-en-v1.5") EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_NAME", "BAAI/bge-small-en-v1.5")
BATCH_SIZE = 50 BATCH_SIZE = 50
METADATA_COLLECTION = "mcp_indexer_metadata"
INCREMENTAL_DAYS = int(os.environ.get("INCREMENTAL_DAYS", "7"))
def decode_mime_words(s: str) -> str: def decode_mime_words(s: str) -> str:
"""Decodes MIME encoded strings (e.g. subjects, filenames).""" """Decodes MIME encoded strings (e.g. subjects, filenames)."""
@@ -145,6 +147,86 @@ def init_qdrant_collection(client: QdrantClient, vector_size: int):
field_schema=models.PayloadSchemaType.KEYWORD, field_schema=models.PayloadSchemaType.KEYWORD,
) )
def init_metadata_collection(client: QdrantClient):
    """Create the indexer metadata collection in Qdrant if it is missing."""
    existing = {c.name for c in client.get_collections().collections}
    if METADATA_COLLECTION in existing:
        return
    print(f"Creating metadata collection '{METADATA_COLLECTION}'...")
    client.create_collection(
        collection_name=METADATA_COLLECTION,
        # Payload-only collection: a 1-dim placeholder vector is the minimum
        # Qdrant requires; we never search this collection by similarity.
        vectors_config=models.VectorParams(size=1, distance=models.Distance.COSINE),
    )
def is_bootstrap_done(client: QdrantClient) -> bool:
    """Return True if a successful full bootstrap has already been recorded."""
    try:
        hits, _next_page = client.scroll(
            collection_name=METADATA_COLLECTION,
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="event",
                        match=models.MatchValue(value="bootstrap_complete"),
                    )
                ]
            ),
            limit=1,
        )
        return bool(hits)
    except Exception as e:
        # Best-effort check: on any failure fall back to a full bootstrap,
        # which is always safe (merely slower).
        print(f"Warning: could not check bootstrap state: {e}")
        return False
def mark_bootstrap_done(client: QdrantClient):
    """Records a bootstrap_complete event in the metadata collection.

    The point ID is a deterministic UUID derived from the event name, so
    repeated bootstraps overwrite the same record instead of accumulating
    duplicates.
    """
    from datetime import timezone

    point_id = str(uuid.uuid5(uuid.NAMESPACE_OID, "bootstrap_complete"))
    client.upsert(
        collection_name=METADATA_COLLECTION,
        points=[
            models.PointStruct(
                id=point_id,
                vector=[0.0],  # placeholder — collection is payload-only
                payload={
                    "event": "bootstrap_complete",
                    # Fix: store a timezone-aware UTC timestamp instead of a
                    # naive local time, consistent with the UTC-aware datetimes
                    # used by get_recent_keys() in this file.
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            )
        ],
    )
    print("Bootstrap state recorded in Qdrant metadata collection.")
def get_recent_keys(mbox: mailbox.Maildir, days: int) -> set:
    """
    Returns the set of Maildir keys whose backing file has been modified
    within the last `days` days (based on filesystem mtime).

    Fix: the previous implementation probed os.path.join(root, subdir, key),
    but Maildir filenames in cur/ carry an info suffix (e.g. ":2,S"), so a
    message's key is NOT its filename — every flagged/read message failed
    the isfile() check and was silently skipped. We instead list cur/ and
    new/ directly and derive the key from the filename by stripping the
    info part at `mbox.colon` (mailbox.Maildir's documented separator).
    """
    from datetime import timezone, timedelta

    cutoff = datetime.now(tz=timezone.utc) - timedelta(days=days)
    recent = set()
    maildir_root = mbox._path  # type: ignore[attr-defined]
    for subdir in ("cur", "new"):
        dir_path = os.path.join(maildir_root, subdir)
        if not os.path.isdir(dir_path):
            continue
        for fname in os.listdir(dir_path):
            # Maildir ignores dotfiles; so do we.
            if fname.startswith("."):
                continue
            try:
                mtime = os.path.getmtime(os.path.join(dir_path, fname))
            except OSError:
                continue  # file vanished between listdir() and stat()
            if datetime.fromtimestamp(mtime, tz=timezone.utc) >= cutoff:
                # Key = filename up to the info separator (default ":").
                recent.add(fname.split(mbox.colon)[0])
    return recent
def main(): def main():
""" """
Main ingestion function. Main ingestion function.
@@ -159,26 +241,49 @@ def main():
# Initialize model # Initialize model
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...") print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
model = TextEmbedding(model_name=EMBEDDING_MODEL_NAME) model = TextEmbedding(model_name=EMBEDDING_MODEL_NAME)
vector_size = len(next(iter(model.embed(["dimension_probe"])) )) vector_size = len(next(iter(model.embed(["dimension_probe"]))))
# Initialize Qdrant # Initialize Qdrant
print("Connecting to Qdrant...") print("Connecting to Qdrant...")
qdrant_client = QdrantClient(url=QDRANT_URL) qdrant_client = QdrantClient(url=QDRANT_URL)
init_qdrant_collection(qdrant_client, vector_size) init_qdrant_collection(qdrant_client, vector_size)
init_metadata_collection(qdrant_client)
# Determine indexing mode: full bootstrap or incremental update
if is_bootstrap_done(qdrant_client):
mode = "incremental"
print(f"[MODE] INCREMENTAL — scanning only files modified in the last {INCREMENTAL_DAYS} days.")
else:
mode = "full"
print("[MODE] BOOTSTRAP — full scan of all emails (first-time indexing).")
points = [] points = []
has_error = False
# Iterate and parse over all maildir directories found in MAILDIR_PATH # Iterate and parse over all maildir directories found in MAILDIR_PATH
for root, dirs, files in os.walk(MAILDIR_PATH): for root, dirs, files in os.walk(MAILDIR_PATH):
# A valid Maildir has 'cur', 'new', and 'tmp' subdirectories # A valid Maildir has 'cur', 'new', and 'tmp' subdirectories
if all(subdir in dirs for subdir in ['cur', 'new', 'tmp']): if not all(subdir in dirs for subdir in ['cur', 'new', 'tmp']):
continue
print(f"Processing Maildir found at: {root}") print(f"Processing Maildir found at: {root}")
mbox = mailbox.Maildir(root) mbox = mailbox.Maildir(root)
total_emails_in_dir = len(mbox) total_emails_in_dir = len(mbox)
print(f"Found {total_emails_in_dir} emails in this directory.")
for idx, (key, msg) in enumerate(mbox.items()): if mode == "incremental":
keys_to_process = get_recent_keys(mbox, INCREMENTAL_DAYS)
print(
f"Found {total_emails_in_dir} emails total, "
f"{len(keys_to_process)} modified in the last {INCREMENTAL_DAYS} days."
)
else:
keys_to_process = set(mbox.keys())
print(f"Found {total_emails_in_dir} emails — indexing all.")
for idx, key in enumerate(keys_to_process):
try: try:
msg = mbox[key]
# Parse headers # Parse headers
subject = decode_mime_words(msg.get("Subject", "No Subject")) subject = decode_mime_words(msg.get("Subject", "No Subject"))
sender_raw = decode_mime_words(msg.get("From", "Unknown")) sender_raw = decode_mime_words(msg.get("From", "Unknown"))
@@ -216,8 +321,7 @@ def main():
f"Attachments: {attachments_str}" f"Attachments: {attachments_str}"
) )
# Embed the text # Embed the text — fastembed returns an iterable of numpy arrays
# Fastembed returns an iterable of numpy arrays
embeddings = list(model.embed([vector_text])) embeddings = list(model.embed([vector_text]))
vector = embeddings[0].tolist() vector = embeddings[0].tolist()
@@ -231,7 +335,7 @@ def main():
"receiver_raw": receiver_raw, "receiver_raw": receiver_raw,
"subject": subject, "subject": subject,
"body_text": body_text, "body_text": body_text,
"attachments": attachments "attachments": attachments,
} }
# Assign deterministic UUID point ID based on message_id # Assign deterministic UUID point ID based on message_id
@@ -240,28 +344,33 @@ def main():
points.append(models.PointStruct( points.append(models.PointStruct(
id=point_id, id=point_id,
vector=vector, vector=vector,
payload=payload payload=payload,
)) ))
# Push in batches # Push in batches
if len(points) >= BATCH_SIZE: if len(points) >= BATCH_SIZE:
qdrant_client.upsert( qdrant_client.upsert(
collection_name=COLLECTION_NAME, collection_name=COLLECTION_NAME,
points=points points=points,
) )
print(f"Processed {idx + 1}/{total_emails_in_dir} emails in current directory...") print(f" Upserted batch — {idx + 1}/{len(keys_to_process)} emails processed in current directory.")
points = [] points = []
except Exception as e: except Exception as e:
print(f"Error processing email key={key}: {e}") print(f"Error processing email key={key}: {e}")
has_error = True
# Push remaining points # Push remaining points
if points: if points:
qdrant_client.upsert( qdrant_client.upsert(
collection_name=COLLECTION_NAME, collection_name=COLLECTION_NAME,
points=points points=points,
) )
# Record bootstrap completion so subsequent runs use incremental mode
if mode == "full" and not has_error:
mark_bootstrap_done(qdrant_client)
print("Indexing completed successfully!") print("Indexing completed successfully!")
if __name__ == "__main__": if __name__ == "__main__":