fix: metadata
Some checks failed
perso/mcp-maildir/pipeline/head Something is wrong with the build of this commit
Some checks failed
perso/mcp-maildir/pipeline/head Something is wrong with the build of this commit
This commit is contained in:
@@ -5,6 +5,7 @@ Indexer script to parse emails from Maildir and push them to Qdrant.
|
||||
import os
|
||||
import email
|
||||
import mailbox
|
||||
import warnings
|
||||
from concurrent.futures import ThreadPoolExecutor, Future
|
||||
from datetime import datetime
|
||||
from email.utils import parsedate_to_datetime, parseaddr
|
||||
@@ -39,6 +40,7 @@ BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "100"))
|
||||
EMBEDDING_BATCH_SIZE = int(os.environ.get("EMBEDDING_BATCH_SIZE", "64"))
|
||||
METADATA_COLLECTION = "mcp_indexer_metadata"
|
||||
INCREMENTAL_DAYS = int(os.environ.get("INCREMENTAL_DAYS", "7"))
|
||||
FORCE_REINDEX = os.environ.get("FORCE_REINDEX", "").lower() in ("1", "true", "yes")
|
||||
|
||||
def decode_mime_words(s: str) -> str:
|
||||
"""Decodes MIME encoded strings (e.g. subjects, filenames)."""
|
||||
@@ -72,7 +74,7 @@ def normalize_email_address(value: str) -> str:
|
||||
return (addr or value).strip().lower()
|
||||
|
||||
|
||||
def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]:
|
||||
def parse_email_message(msg: mailbox.Message, email_id: str = "") -> Tuple[str, List[str]]:
|
||||
"""Extracts plain text body and a list of attachment filenames."""
|
||||
body_parts = []
|
||||
attachments = []
|
||||
@@ -99,16 +101,24 @@ def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]:
|
||||
if payload:
|
||||
charset = part.get_content_charset('utf-8') or 'utf-8'
|
||||
if isinstance(payload, bytes):
|
||||
text = payload.decode(charset, errors='replace')
|
||||
try:
|
||||
text = payload.decode(charset, errors='replace')
|
||||
except (LookupError, UnicodeDecodeError):
|
||||
# Unknown or broken charset — fall back to utf-8
|
||||
print(f" Warning: unknown charset '{charset}', falling back to utf-8 [{email_id}]")
|
||||
text = payload.decode('utf-8', errors='replace')
|
||||
else:
|
||||
text = str(payload)
|
||||
|
||||
if content_type == "text/html":
|
||||
text = extract_text_from_html(text)
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
warnings.simplefilter("always")
|
||||
text = extract_text_from_html(text)
|
||||
for w in caught:
|
||||
print(f" Warning: {w.category.__name__}: {w.message} [{email_id}]")
|
||||
body_parts.append(text)
|
||||
except Exception as e:
|
||||
print(f"Error extracting payload: {e}")
|
||||
pass
|
||||
print(f" Warning: error extracting payload: {e} [{email_id}]")
|
||||
|
||||
return "\n".join(body_parts).strip(), attachments
|
||||
|
||||
@@ -255,6 +265,17 @@ def main():
|
||||
# Initialize Qdrant
|
||||
print("Connecting to Qdrant...")
|
||||
qdrant_client = QdrantClient(url=QDRANT_URL)
|
||||
|
||||
# Force reindex: wipe existing collections to start from scratch
|
||||
if FORCE_REINDEX:
|
||||
print("[FORCE_REINDEX] Deleting existing collections for a clean re-bootstrap...")
|
||||
for col_name in (COLLECTION_NAME, METADATA_COLLECTION):
|
||||
try:
|
||||
qdrant_client.delete_collection(collection_name=col_name)
|
||||
print(f" Deleted collection '{col_name}'.")
|
||||
except Exception:
|
||||
print(f" Collection '{col_name}' did not exist, skipping.")
|
||||
|
||||
init_qdrant_collection(qdrant_client, vector_size)
|
||||
init_metadata_collection(qdrant_client)
|
||||
|
||||
@@ -343,10 +364,15 @@ def main():
|
||||
receiver_raw = decode_mime_words(msg.get("To", "Unknown"))
|
||||
sender = normalize_email_address(sender_raw)
|
||||
receiver = normalize_email_address(receiver_raw)
|
||||
message_id = msg.get("Message-ID", str(uuid.uuid4()))
|
||||
message_id_raw = msg.get("Message-ID")
|
||||
message_id = str(message_id_raw) if message_id_raw is not None else str(uuid.uuid4())
|
||||
|
||||
# Parse date
|
||||
date_str = msg.get("Date")
|
||||
# Parse date — msg.get() may return an email.header.Header
|
||||
# object instead of str when the header contains non-ASCII
|
||||
# bytes (e.g. timezone comments like "heure d'été").
|
||||
# We must coerce to str before parsing.
|
||||
date_raw = msg.get("Date")
|
||||
date_str = str(date_raw) if date_raw is not None else None
|
||||
dt_obj = None
|
||||
if date_str:
|
||||
try:
|
||||
@@ -354,11 +380,13 @@ def main():
|
||||
except Exception:
|
||||
pass
|
||||
if dt_obj is None:
|
||||
# Fallback: warn and use current time
|
||||
print(f" Warning: could not parse Date header: {repr(date_raw)} [key={key}, subject={subject}]")
|
||||
dt_obj = datetime.now()
|
||||
iso_date = dt_obj.isoformat()
|
||||
|
||||
# Parse body and attachments
|
||||
body_text, attachments = parse_email_message(msg)
|
||||
body_text, attachments = parse_email_message(msg, email_id=f"key={key}, subject={subject}")
|
||||
|
||||
attachments_str = ", ".join(attachments) if attachments else "None"
|
||||
vector_text = (
|
||||
|
||||
Reference in New Issue
Block a user