From b96277064a6712762488eb8517fb79cbe02ef30b Mon Sep 17 00:00:00 2001 From: Julien Cabillot Date: Thu, 26 Feb 2026 13:55:13 -0500 Subject: [PATCH] feat: init --- .env.example | 3 + .gitignore | 28 +++++ README.md | 81 +++++++++++++++ docker-compose.yml | 31 ++++++ pkg/Dockerfile | 21 ++++ requirements.txt | 9 ++ src/indexer.py | 254 +++++++++++++++++++++++++++++++++++++++++++++ src/server.py | 32 ++++++ 8 files changed, 459 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 README.md create mode 100644 docker-compose.yml create mode 100644 pkg/Dockerfile create mode 100644 requirements.txt create mode 100644 src/indexer.py create mode 100644 src/server.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c40defb --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +MAILDIR_PATH=/path/to/your/maildir/dump +QDRANT_URL=http://localhost:6333 +COLLECTION_NAME=my_emails diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ce36e7c --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# Virtual environments +venv/ +.venv/ +env/ +env.bak/ + +# Environment variables +.env +.env.local + +# Python cache and compiled files +__pycache__/ +*.py[cod] +*$py.class +*.so + +# Distribution / packaging +build/ +dist/ +*.egg-info/ +.eggs/ + +# Vector database local storage +qdrant_storage/ + +# OS generated files +.DS_Store +Thumbs.db diff --git a/README.md b/README.md new file mode 100644 index 0000000..61c86c1 --- /dev/null +++ b/README.md @@ -0,0 +1,81 @@ +# mcp-maildir + +**mcp-maildir** is an MCP (Model Context Protocol) server allowing AI agents (Claude, OpenHands, Cursor, etc.) to explore, search, and read your email archives in a **100% offline, local, and strict Read-Only (R/O)** manner. 
+ +This project uses an email dump in the `Maildir` format (e.g., generated via `offlineimap`), vectorizes the content locally for semantic search, and stores everything in **Qdrant** to provide ultra-fast hybrid search (Semantic + Exact filtering on metadata like dates or senders). + +## ✨ Features + +* 🔒 **Strict Read-Only**: The AI interacts with a Qdrant index; it has no direct access to your actual mailbox or the `maildir` files. Zero risk of accidental deletion or sending. +* 🧠 **Hybrid Search**: + * *Semantic*: Find concepts ("farewell party organization") via local `sentence-transformers`. + * *Factual*: Filter deterministically by sender or exact date ranges (thanks to Qdrant's native payload indexes). +* 🚀 **MCP Standard**: Instantly compatible with any client supporting the Model Context Protocol. + +## 🏗️ Architecture + +1. **Source**: Your local `Maildir` folder. +2. **Indexer (`indexer.py`)**: A Python script that parses emails, extracts raw text, generates local embeddings, and pushes everything to Qdrant along with metadata. +3. **Database**: Qdrant (running locally via Docker). +4. **MCP Server (`server.py`)**: Exposes search and read tools to the AI agent via `FastMCP`. + +## 🛠️ Prerequisites + +* Python 3.10+ +* Docker (to run Qdrant) +* An email folder in `Maildir` format + +## 🚀 Installation & Setup + +### 1. Clone and prepare the environment + +```bash +git clone https://github.com/your-username/mcp-maildir.git +cd mcp-maildir + +# Create a virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt +``` + +### 2. Start Qdrant (Vector Database) + +Run Qdrant locally in the background using Docker: + +```sh +docker run -d -p 6333:6333 -p 6334:6334 \ + -v $(pwd)/qdrant_storage:/qdrant/storage:z \ + qdrant/qdrant +``` + +### 3. 
Configuration + +Create a `.env` file (you can copy `.env.example`) or export the variables in your shell to point to your Maildir folder and your Qdrant instance: + +``` +MAILDIR_PATH=/path/to/your/maildir/dump +QDRANT_URL=http://localhost:6333 +COLLECTION_NAME=my_emails +``` + +### 4. Initial Indexing (Ingestion) + +Before the AI can search, you need to index your emails. Run the ingestion script (to be executed every time you sync new emails): + +```sh +python src/indexer.py +``` + +Note: The indexer.py script automatically configures Qdrant payload indexes for metadata (sender and receiver as KEYWORD, date as DATETIME) to guarantee fast and deterministic static searches. + +### 🤖 Usage with an MCP Client + +Tools exposed by the server + +The server.py script exposes the following tools to the AI: + +* search_emails(query: str, sender: str, start_date: str, end_date: str): Performs a hybrid search. Metadata parameters are optional. +* read_email(message_id: str): Returns the full text content (cleaned of HTML) of a specific email. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..04dff4f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,31 @@ +services: + qdrant: + image: docker.io/qdrant/qdrant:latest + container_name: mcp_maildir_qdrant + ports: + - "6333:6333" # REST API + - "6334:6334" # gRPC API + volumes: + - ./qdrant_storage:/qdrant/storage:z + restart: unless-stopped + + mcp-server: + build: + context: . + dockerfile: pkg/Dockerfile + container_name: mcp_maildir_server + ports: + - "8000:8000" # Expose the MCP HTTP (SSE) server + env_file: + - .env + environment: + # Override Qdrant URL to point to the docker-compose service + - QDRANT_URL=http://qdrant:6333 + volumes: + # Mount the source code for hot-reloading (optional) + - ./src:/app/src:ro,z + # Mount the maildir dump as read-only. 
+ # Ensure you set MAILDIR_PATH in your .env file + - ${MAILDIR_PATH:-./maildir_dump}:/path/to/your/maildir/dump:ro,z + depends_on: + - qdrant diff --git a/pkg/Dockerfile b/pkg/Dockerfile new file mode 100644 index 0000000..5a4dae7 --- /dev/null +++ b/pkg/Dockerfile @@ -0,0 +1,21 @@ +# Use Python 3.14 as requested +FROM docker.io/library/python:3.14-slim + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +# Set the working directory in the container +WORKDIR /app + +# Copy the requirements file into the container +COPY requirements.txt . + +# Install dependencies using buildkit cache +RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt + +# Copy the source code +COPY src/ ./src/ + +# Command to run the MCP server +CMD ["python", "src/server.py"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cedd5e9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +mcp +fastmcp +qdrant-client +sentence-transformers +python-dotenv +uvicorn +starlette +beautifulsoup4 +python-dateutil diff --git a/src/indexer.py b/src/indexer.py new file mode 100644 index 0000000..4d31962 --- /dev/null +++ b/src/indexer.py @@ -0,0 +1,254 @@ +""" +Indexer script to parse emails from Maildir and push them to Qdrant. 
+"""
+
+import os
+import email
+import mailbox
+from datetime import datetime, timezone
+from email.errors import HeaderParseError
+from email.utils import parsedate_to_datetime
+from email.header import decode_header
+from typing import List, Dict, Any, Tuple
+import uuid
+
+from dotenv import load_dotenv
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
+from sentence_transformers import SentenceTransformer
+from bs4 import BeautifulSoup
+
+# Load .env config
+load_dotenv()
+
+# Configuration
+MAILDIR_PATH = os.environ.get("MAILDIR_PATH", "")
+QDRANT_URL = os.environ.get("QDRANT_URL", "")
+COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "")
+
+if not MAILDIR_PATH:
+    raise ValueError("MAILDIR_PATH environment variable is required.")
+if not QDRANT_URL:
+    raise ValueError("QDRANT_URL environment variable is required.")
+if not COLLECTION_NAME:
+    raise ValueError("COLLECTION_NAME environment variable is required.")
+
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+BATCH_SIZE = 50
+# Subdirectories whose presence marks a directory as a Maildir.
+MAILDIR_SUBDIRS = ("cur", "new", "tmp")
+
+
+def decode_mime_words(s: str) -> str:
+    """Decode a MIME encoded string (e.g. a subject or a filename).
+
+    Args:
+        s: Raw header value, possibly containing encoded words.
+
+    Returns:
+        The decoded text, or "" when ``s`` is empty. Undecodable bytes are
+        replaced instead of raising, an unknown charset name falls back to
+        UTF-8, and a value with malformed encoded words is returned as-is.
+    """
+    if not s:
+        return ""
+    try:
+        decoded_words = decode_header(s)
+    except HeaderParseError:
+        # decode_header() raises on malformed base64 in an encoded word.
+        # Keep the raw value rather than failing the whole email over it.
+        return str(s)
+
+    result = []
+    for word, encoding in decoded_words:
+        if isinstance(word, bytes):
+            try:
+                result.append(word.decode(encoding or "utf-8", errors="replace"))
+            except LookupError:
+                # The header declares a charset Python does not know.
+                result.append(word.decode("utf-8", errors="replace"))
+        else:
+            result.append(word)
+    return "".join(result)
+
+
+def extract_text_from_html(html_content: str) -> str:
+    """Extract plain text from HTML content.
+
+    Returns the text nodes joined by single spaces. If parsing fails, the
+    input is returned unchanged so the content is still indexed.
+    """
+    try:
+        soup = BeautifulSoup(html_content, "html.parser")
+        return soup.get_text(separator=" ", strip=True)
+    except Exception:
+        # Deliberate best effort: raw HTML is better than no body at all.
+        return html_content
+
+
+def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]:
+    """Extract the text body and the attachment filenames of an email.
+
+    Walks every non-multipart part of ``msg``. A part that has a filename or
+    an "attachment" Content-Disposition is treated as an attachment; the
+    remaining text/plain and text/html parts are decoded (HTML is reduced to
+    plain text) and collected as body text.
+
+    Args:
+        msg: The message to inspect.
+
+    Returns:
+        A ``(body, attachments)`` tuple: the body parts joined with newlines
+        and stripped, and the decoded attachment filenames.
+    """
+    body_parts = []
+    attachments = []
+
+    for part in msg.walk():
+        # Skip multiparts, we only care about leaf nodes
+        if part.is_multipart():
+            continue
+
+        content_type = part.get_content_type()
+        content_disposition = str(part.get("Content-Disposition", ""))
+        filename = part.get_filename()
+
+        # Check for attachments
+        if "attachment" in content_disposition or filename:
+            if filename:
+                attachments.append(decode_mime_words(filename))
+            continue
+
+        # Extract text body
+        if content_type not in ("text/plain", "text/html"):
+            continue
+        try:
+            payload = part.get_payload(decode=True)
+            if not payload:
+                continue
+            if isinstance(payload, bytes):
+                charset = part.get_content_charset("utf-8") or "utf-8"
+                try:
+                    text = payload.decode(charset, errors="replace")
+                except LookupError:
+                    # Unknown charset declared by the part: fall back to
+                    # UTF-8 instead of dropping the body part.
+                    text = payload.decode("utf-8", errors="replace")
+            else:
+                text = str(payload)
+
+            if content_type == "text/html":
+                text = extract_text_from_html(text)
+            body_parts.append(text)
+        except Exception as e:
+            # Best effort: one broken part must not discard the whole email.
+            print(f"Error extracting payload: {e}")
+
+    return "\n".join(body_parts).strip(), attachments
+
+
+def init_qdrant_collection(client: QdrantClient, model: SentenceTransformer):
+    """Ensure the Qdrant collection exists and request its payload indexes.
+
+    The collection is created with cosine distance and a vector size taken
+    from ``model`` when no collection named COLLECTION_NAME exists yet.
+    Payload indexes for the ``date``, ``sender`` and ``receiver`` fields are
+    requested on every call.
+
+    Args:
+        client: Connected Qdrant client.
+        model: Embedding model; only its embedding dimension is read.
+    """
+    vector_size = model.get_sentence_embedding_dimension()
+
+    # Check if collection exists
+    collections = client.get_collections().collections
+    if not any(c.name == COLLECTION_NAME for c in collections):
+        print(f"Creating collection '{COLLECTION_NAME}' with vector size {vector_size}...")
+        client.create_collection(
+            collection_name=COLLECTION_NAME,
+            vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE),
+        )
+    else:
+        print(f"Collection '{COLLECTION_NAME}' already exists.")
+
+    # Create payload indexes for filtering metadata deterministically
+    print("Ensuring payload indexes exist...")
+    payload_indexes = {
+        "date": models.PayloadSchemaType.DATETIME,
+        "sender": models.PayloadSchemaType.KEYWORD,
+        "receiver": models.PayloadSchemaType.KEYWORD,
+    }
+    for field_name, field_schema in payload_indexes.items():
+        client.create_payload_index(
+            collection_name=COLLECTION_NAME,
+            field_name=field_name,
+            field_schema=field_schema,
+        )
+
+
+def _parse_email_date(date_str: Any) -> datetime:
+    """Convert a raw ``Date`` header value into a timezone-aware datetime.
+
+    Args:
+        date_str: Value of the ``Date`` header, or None when it is absent.
+
+    Returns:
+        The parsed date. A parsed value without timezone information is
+        tagged as UTC, and the current UTC time is used when the header is
+        missing or cannot be parsed.
+    """
+    dt_obj = None
+    if date_str:
+        try:
+            dt_obj = parsedate_to_datetime(date_str)
+        except (TypeError, ValueError, IndexError, OverflowError):
+            # Unparseable date: use the "now" fallback below.
+            dt_obj = None
+
+    if dt_obj is None:
+        return datetime.now(timezone.utc)
+    if dt_obj.tzinfo is None:
+        # parsedate_to_datetime() returns a naive datetime for a "-0000"
+        # zone, which denotes a UTC time; make that explicit so every
+        # indexed date carries an offset.
+        return dt_obj.replace(tzinfo=timezone.utc)
+    return dt_obj
+
+
+def _build_point(key: str, msg: mailbox.Message, model: SentenceTransformer) -> models.PointStruct:
+    """Build the Qdrant point (embedding + metadata payload) for one email.
+
+    Args:
+        key: Maildir key of the message, used as identifier when the email
+            has no Message-ID header.
+        msg: The message to index.
+        model: Embedding model used to vectorize the email text.
+
+    Returns:
+        A point whose ID is a UUIDv5 derived from the message identifier, so
+        indexing the same email again yields the same point ID.
+    """
+    # Parse headers
+    subject = decode_mime_words(msg.get("Subject", "No Subject"))
+    sender = decode_mime_words(msg.get("From", "Unknown"))
+    receiver = decode_mime_words(msg.get("To", "Unknown"))
+
+    message_id = str(msg.get("Message-ID") or "").strip()
+    if not message_id:
+        # A random ID would give this email a new point ID on every run and
+        # duplicate it in the collection. The Maildir key stays the same
+        # between runs as long as the message file is not renamed.
+        message_id = f"maildir-key:{key}"
+
+    # Format to ISO 8601 for Qdrant DATETIME index
+    iso_date = _parse_email_date(msg.get("Date")).isoformat()
+
+    # Parse Body and Attachments
+    body_text, attachments = parse_email_message(msg)
+
+    # Prepare Vector text
+    attachments_str = ", ".join(attachments) if attachments else "None"
+    vector_text = (
+        f"Date: {iso_date}\n"
+        f"From: {sender}\n"
+        f"To: {receiver}\n"
+        f"Subject: {subject}\n\n"
+        f"{body_text}\n\n"
+        f"Attachments: {attachments_str}"
+    )
+
+    # Embed the text
+    vector = model.encode(vector_text).tolist()
+
+    # Prepare payload (metadata)
+    payload = {
+        "message_id": message_id,
+        "date": iso_date,
+        "sender": sender,
+        "receiver": receiver,
+        "subject": subject,
+        "body_text": body_text,
+        "attachments": attachments,
+    }
+
+    # Assign deterministic UUID point ID based on message_id
+    point_id = str(uuid.uuid5(uuid.NAMESPACE_OID, message_id))
+
+    return models.PointStruct(id=point_id, vector=vector, payload=payload)
+
+
+def main():
+    """
+    Main ingestion function.
+
+    Walks MAILDIR_PATH, treats every directory that contains 'cur', 'new'
+    and 'tmp' as a Maildir, embeds each email locally and upserts the
+    resulting points into Qdrant in batches of BATCH_SIZE. An email that
+    fails to process is reported and skipped.
+    """
+    print(f"Indexing emails from {MAILDIR_PATH} into {QDRANT_URL}...")
+
+    if not os.path.exists(MAILDIR_PATH):
+        print(f"Error: Maildir path not found: {MAILDIR_PATH}")
+        return
+
+    # Initialize model
+    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
+    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+
+    # Initialize Qdrant
+    print("Connecting to Qdrant...")
+    qdrant_client = QdrantClient(url=QDRANT_URL)
+    init_qdrant_collection(qdrant_client, model)
+
+    points = []
+
+    # Iterate and parse over all maildir directories found in MAILDIR_PATH
+    for root, dirs, _files in os.walk(MAILDIR_PATH):
+        # A valid Maildir has 'cur', 'new', and 'tmp' subdirectories
+        if not all(subdir in dirs for subdir in MAILDIR_SUBDIRS):
+            continue
+
+        # The mailbox module reads the message directories itself; pruning
+        # them keeps os.walk from listing every message file a second time.
+        dirs[:] = [d for d in dirs if d not in MAILDIR_SUBDIRS]
+
+        print(f"Processing Maildir found at: {root}")
+        # create=False: the indexer must never write into the mail dump.
+        mbox = mailbox.Maildir(root, create=False)
+        total_emails_in_dir = len(mbox)
+        print(f"Found {total_emails_in_dir} emails in this directory.")
+
+        # iteritems() yields messages one at a time; items() would build a
+        # list holding every parsed message of the directory in memory.
+        for idx, (key, msg) in enumerate(mbox.iteritems()):
+            try:
+                points.append(_build_point(key, msg, model))
+
+                # Push in batches
+                if len(points) >= BATCH_SIZE:
+                    qdrant_client.upsert(
+                        collection_name=COLLECTION_NAME,
+                        points=points
+                    )
+                    print(f"Processed {idx + 1}/{total_emails_in_dir} emails in current directory...")
+                    points = []
+
+            except Exception as e:
+                print(f"Error processing email key={key}: {e}")
+
+    # Push remaining points
+    if points:
+        qdrant_client.upsert(
+            collection_name=COLLECTION_NAME,
+            points=points
+        )
+
+    print("Indexing completed successfully!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/server.py b/src/server.py
new file mode 100644
index 0000000..f8ee6f6
--- /dev/null
+++ b/src/server.py
@@ -0,0 +1,32 @@
+"""
+MCP Server exposing search and read tools for the indexed emails.
+"""
+
+import os
+from fastmcp import FastMCP
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Initialize FastMCP server
+mcp = FastMCP("mcp-maildir")
+
+@mcp.tool()
+def search_emails(query: str, sender: str | None = None, start_date: str | None = None, end_date: str | None = None):
+    """
+    Performs a hybrid search (Semantic + Exact filtering on metadata).
+    """
+    # TODO: Implement Qdrant search
+    return f"Searching for '{query}'..."
+
+@mcp.tool()
+def read_email(message_id: str):
+    """
+    Returns the full text content (cleaned of HTML) of a specific email.
+    """
+    # TODO: Implement fetching email by message_id
+    return f"Reading email {message_id}..."
+
+if __name__ == "__main__":
+    # Start the MCP server using SSE (Server-Sent Events) over HTTP
+    mcp.run(transport="sse", host="0.0.0.0", port=8000)