fix: perms
All checks were successful
perso/mcp-maildir/pipeline/head This commit looks good

This commit is contained in:
Julien Cabillot
2026-02-26 19:18:06 -05:00
parent b96277064a
commit 4e45f3e8d9
6 changed files with 388 additions and 22 deletions

38
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,38 @@
pipeline {
environment {
registry = 'https://registry.hub.docker.com'
registryCredential = 'dockerhub_jcabillot'
dockerImage = 'jcabillot/mcp-maildir'
}
agent any
triggers {
cron('@midnight')
}
stages {
stage('Clone repository') {
steps{
checkout scm
}
}
stage('Build image') {
steps{
sh 'docker build --force-rm=true --no-cache=true --pull -t ${dockerImage} -f pkg/Dockerfile .'
}
}
stage('Deploy Image') {
steps{
script {
withCredentials([usernamePassword(credentialsId: 'dockerhub_jcabillot', usernameVariable: 'DOCKER_USER', passwordVariable: 'DOCKER_PASS')]) {
sh 'docker login --username ${DOCKER_USER} --password ${DOCKER_PASS}'
sh 'docker push ${dockerImage}'
}
}
}
}
}
}

View File

@@ -3,8 +3,8 @@ services:
image: docker.io/qdrant/qdrant:latest image: docker.io/qdrant/qdrant:latest
container_name: mcp_maildir_qdrant container_name: mcp_maildir_qdrant
ports: ports:
- "6333:6333" # REST API - "127.0.0.1:6333:6333" # REST API
- "6334:6334" # gRPC API - "127.0.0.1:6334:6334" # gRPC API
volumes: volumes:
- ./qdrant_storage:/qdrant/storage:z - ./qdrant_storage:/qdrant/storage:z
restart: unless-stopped restart: unless-stopped
@@ -15,7 +15,7 @@ services:
dockerfile: pkg/Dockerfile dockerfile: pkg/Dockerfile
container_name: mcp_maildir_server container_name: mcp_maildir_server
ports: ports:
- "8000:8000" # Expose the MCP HTTP (SSE) server - "127.0.0.1:8000:8000" # Expose the MCP HTTP (SSE) server
env_file: env_file:
- .env - .env
environment: environment:

View File

@@ -1,5 +1,5 @@
# Use Python 3.14 as requested # 3.13, because with 3.14 it requires gcc (wheels not available)
FROM docker.io/library/python:3.14-slim FROM docker.io/library/python:3.13-slim
# Set environment variables # Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONDONTWRITEBYTECODE=1

View File

@@ -1,7 +1,7 @@
mcp mcp
fastmcp fastmcp
qdrant-client qdrant-client
sentence-transformers fastembed
python-dotenv python-dotenv
uvicorn uvicorn
starlette starlette

View File

@@ -6,7 +6,7 @@ import os
import email import email
import mailbox import mailbox
from datetime import datetime from datetime import datetime
from email.utils import parsedate_to_datetime from email.utils import parsedate_to_datetime, parseaddr
from email.header import decode_header from email.header import decode_header
from typing import List, Dict, Any, Tuple from typing import List, Dict, Any, Tuple
import uuid import uuid
@@ -14,7 +14,7 @@ import uuid
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client import QdrantClient from qdrant_client import QdrantClient
from qdrant_client.http import models from qdrant_client.http import models
from sentence_transformers import SentenceTransformer from fastembed import TextEmbedding
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# Load .env config # Load .env config
@@ -32,7 +32,7 @@ if not QDRANT_URL:
if not COLLECTION_NAME: if not COLLECTION_NAME:
raise ValueError("COLLECTION_NAME environment variable is required.") raise ValueError("COLLECTION_NAME environment variable is required.")
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_NAME", "BAAI/bge-small-en-v1.5")
BATCH_SIZE = 50 BATCH_SIZE = 50
def decode_mime_words(s: str) -> str: def decode_mime_words(s: str) -> str:
@@ -59,6 +59,14 @@ def extract_text_from_html(html_content: str) -> str:
except Exception: except Exception:
return html_content return html_content
def normalize_email_address(value: str) -> str:
"""Extracts and normalizes the bare email address from a header value."""
if not value:
return ""
_, addr = parseaddr(value)
return (addr or value).strip().lower()
def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]: def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]:
"""Extracts plain text body and a list of attachment filenames.""" """Extracts plain text body and a list of attachment filenames."""
body_parts = [] body_parts = []
@@ -99,9 +107,8 @@ def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]:
return "\n".join(body_parts).strip(), attachments return "\n".join(body_parts).strip(), attachments
def init_qdrant_collection(client: QdrantClient, model: SentenceTransformer): def init_qdrant_collection(client: QdrantClient, vector_size: int):
"""Ensures Qdrant collection exists and payload indexes are created.""" """Ensures Qdrant collection exists and payload indexes are created."""
vector_size = model.get_sentence_embedding_dimension()
# Check if collection exists # Check if collection exists
collections = client.get_collections().collections collections = client.get_collections().collections
@@ -151,12 +158,13 @@ def main():
# Initialize model # Initialize model
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...") print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME) model = TextEmbedding(model_name=EMBEDDING_MODEL_NAME)
vector_size = len(next(iter(model.embed(["dimension_probe"])) ))
# Initialize Qdrant # Initialize Qdrant
print("Connecting to Qdrant...") print("Connecting to Qdrant...")
qdrant_client = QdrantClient(url=QDRANT_URL) qdrant_client = QdrantClient(url=QDRANT_URL)
init_qdrant_collection(qdrant_client, model) init_qdrant_collection(qdrant_client, vector_size)
points = [] points = []
@@ -173,8 +181,10 @@ def main():
try: try:
# Parse headers # Parse headers
subject = decode_mime_words(msg.get("Subject", "No Subject")) subject = decode_mime_words(msg.get("Subject", "No Subject"))
sender = decode_mime_words(msg.get("From", "Unknown")) sender_raw = decode_mime_words(msg.get("From", "Unknown"))
receiver = decode_mime_words(msg.get("To", "Unknown")) receiver_raw = decode_mime_words(msg.get("To", "Unknown"))
sender = normalize_email_address(sender_raw)
receiver = normalize_email_address(receiver_raw)
message_id = msg.get("Message-ID", str(uuid.uuid4())) message_id = msg.get("Message-ID", str(uuid.uuid4()))
# Parse date # Parse date
@@ -207,14 +217,18 @@ def main():
) )
# Embed the text # Embed the text
vector = model.encode(vector_text).tolist() # Fastembed returns an iterable of numpy arrays
embeddings = list(model.embed([vector_text]))
vector = embeddings[0].tolist()
# Prepare payload (metadata) # Prepare payload (metadata)
payload = { payload = {
"message_id": message_id, "message_id": message_id,
"date": iso_date, "date": iso_date,
"sender": sender, "sender": sender,
"sender_raw": sender_raw,
"receiver": receiver, "receiver": receiver,
"receiver_raw": receiver_raw,
"subject": subject, "subject": subject,
"body_text": body_text, "body_text": body_text,
"attachments": attachments "attachments": attachments

View File

@@ -2,31 +2,345 @@
MCP Server exposing search and read tools for the indexed emails. MCP Server exposing search and read tools for the indexed emails.
""" """
import logging
import os import os
from fastmcp import FastMCP from typing import Any, Optional
import qdrant_client as qdrant_pkg
from dateutil import parser as date_parser
from email.utils import parseaddr
from dotenv import load_dotenv from dotenv import load_dotenv
from fastmcp import FastMCP
from qdrant_client import QdrantClient
from qdrant_client.http import models
from fastembed import TextEmbedding
load_dotenv() load_dotenv()
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)
logger = logging.getLogger(__name__)
# Verbose MCP diagnostics (useful for -32602 / validation flow)
logging.getLogger("mcp").setLevel(logging.DEBUG)
logging.getLogger("mcp.server").setLevel(logging.DEBUG)
logging.getLogger("mcp.server.lowlevel.server").setLevel(logging.DEBUG)
QDRANT_URL = os.environ.get("QDRANT_URL", "")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "")
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_NAME", "BAAI/bge-small-en-v1.5")
DEFAULT_LIMIT = int(os.environ.get("SEARCH_LIMIT", "50"))
if not QDRANT_URL:
raise ValueError("QDRANT_URL environment variable is required.")
if not COLLECTION_NAME:
raise ValueError("COLLECTION_NAME environment variable is required.")
logger.info(f"Starting MCP server with collection: {COLLECTION_NAME}")
# Initialize FastMCP server # Initialize FastMCP server
mcp = FastMCP("mcp-maildir") mcp = FastMCP("mcp-maildir")
# Lazy singletons
_qdrant_client: Optional[QdrantClient] = None
_embedding_model: Optional[TextEmbedding] = None
def get_qdrant_client() -> QdrantClient:
"""Returns a singleton Qdrant client."""
global _qdrant_client
if _qdrant_client is None:
logger.info(f"Connecting to Qdrant at {QDRANT_URL}")
_qdrant_client = QdrantClient(url=QDRANT_URL)
return _qdrant_client
def get_embedding_model() -> TextEmbedding:
"""Returns a singleton sentence-transformer model."""
global _embedding_model
if _embedding_model is None:
logger.info(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
_embedding_model = TextEmbedding(model_name=EMBEDDING_MODEL_NAME)
return _embedding_model
def log_qdrant_diagnostics(client: QdrantClient) -> None:
"""Logs Qdrant client capabilities to diagnose API/version mismatch issues."""
version = getattr(qdrant_pkg, "__version__", "unknown")
logger.info(
"Qdrant diagnostics: client_type=%s, qdrant_client_version=%s, has_search=%s, has_query_points=%s, has_scroll=%s",
type(client).__name__,
version,
hasattr(client, "search"),
hasattr(client, "query_points"),
hasattr(client, "scroll"),
)
def normalize_email_address(value: Optional[str]) -> str:
"""Extracts and normalizes the bare email address from a value."""
if not value:
return ""
_, addr = parseaddr(value)
return (addr or value).strip().lower()
def payload_matches_participant(payload: dict[str, Any], participant: str) -> bool:
"""Checks if participant matches either sender or receiver on normalized and raw payload values."""
participant_norm = normalize_email_address(participant)
# Sender checks
payload_sender = normalize_email_address(payload.get("sender"))
payload_sender_raw = normalize_email_address(payload.get("sender_raw"))
payload_sender_text = str(payload.get("sender", "")).strip().lower()
payload_sender_raw_text = str(payload.get("sender_raw", "")).strip().lower()
# Receiver checks
payload_receiver = normalize_email_address(payload.get("receiver"))
payload_receiver_raw = normalize_email_address(payload.get("receiver_raw"))
payload_receiver_text = str(payload.get("receiver", "")).strip().lower()
payload_receiver_raw_text = str(payload.get("receiver_raw", "")).strip().lower()
return participant_norm in {
payload_sender,
payload_sender_raw,
payload_receiver,
payload_receiver_raw,
} or participant_norm in payload_sender_text \
or participant_norm in payload_sender_raw_text \
or participant_norm in payload_receiver_text \
or participant_norm in payload_receiver_raw_text
def build_filter(participant: Optional[str], start_date: Optional[str], end_date: Optional[str]) -> Optional[models.Filter]:
"""Builds Qdrant payload filters from optional parameters."""
conditions: list[models.FieldCondition] = []
if participant:
# Filter for either sender or receiver matching the participant email
normalized_participant = normalize_email_address(participant)
# In Qdrant, to do an OR condition, we use a Should clause within a Filter
participant_filter = models.Filter(
should=[
models.FieldCondition(
key="sender",
match=models.MatchValue(value=normalized_participant),
),
models.FieldCondition(
key="receiver",
match=models.MatchValue(value=normalized_participant),
)
]
)
# We append this compound filter as a requirement
conditions.append(participant_filter)
if start_date or end_date:
gte = date_parser.parse(start_date).isoformat() if start_date else None
lte = date_parser.parse(end_date).isoformat() if end_date else None
conditions.append(
models.FieldCondition(
key="date",
range=models.DatetimeRange(gte=gte, lte=lte),
)
)
return models.Filter(must=conditions) if conditions else None
def format_search_result(point: Any) -> dict[str, Any]:
"""Formats a Qdrant point into a compact response."""
payload = point.payload or {}
return {
"message_id": payload.get("message_id"),
"date": payload.get("date"),
"sender": payload.get("sender"),
"receiver": payload.get("receiver"),
"subject": payload.get("subject"),
"attachments": payload.get("attachments", []),
"score": getattr(point, "score", None),
}
def vector_search_points(
client: QdrantClient,
*,
query_vector: list[float],
query_filter: Optional[models.Filter],
limit: int,
) -> list[Any]:
"""Executes vector search across Qdrant client API variants."""
if hasattr(client, "query_points"):
response = client.query_points(
collection_name=COLLECTION_NAME,
query=query_vector,
query_filter=query_filter,
limit=limit,
with_payload=True,
with_vectors=False,
)
points = getattr(response, "points", None)
if points is None:
if isinstance(response, list):
return response
return []
return points
if hasattr(client, "search"):
return client.search(
collection_name=COLLECTION_NAME,
query_vector=query_vector,
query_filter=query_filter,
limit=limit,
with_payload=True,
with_vectors=False,
)
raise AttributeError("Qdrant client exposes neither 'query_points' nor 'search'.")
@mcp.tool() @mcp.tool()
def search_emails(query: str, sender: str | None = None, start_date: str | None = None, end_date: str | None = None): def search_emails(
query: str,
participant: str = "",
start_date: str = "",
end_date: str = "",
):
""" """
Performs a hybrid search (Semantic + Exact filtering on metadata). Performs a hybrid search (Semantic + Exact filtering on metadata).
""" """
# TODO: Implement Qdrant search logger.info(
return f"Searching for '{query}'..." "Tool search_emails input diagnostics: query=%r(type=%s,len=%d), participant=%r(type=%s), start_date=%r(type=%s), end_date=%r(type=%s)",
query,
type(query).__name__,
len(query) if isinstance(query, str) else -1,
participant,
type(participant).__name__,
start_date,
type(start_date).__name__,
end_date,
type(end_date).__name__,
)
# Convert empty strings to None for internal logic
p_val = normalize_email_address(participant) if participant else None
sd_val = start_date if start_date else None
ed_val = end_date if end_date else None
logger.info(
"Tool search_emails normalized filters: participant=%r, start_date=%r, end_date=%r",
p_val,
sd_val,
ed_val,
)
try:
model = get_embedding_model()
qdrant = get_qdrant_client()
log_qdrant_diagnostics(qdrant)
query_vector = list(model.embed([query]))[0].tolist()
query_filter = build_filter(participant=p_val, start_date=sd_val, end_date=ed_val)
logger.info("Tool search_emails built filter: %s", query_filter)
points = vector_search_points(
qdrant,
query_vector=query_vector,
query_filter=query_filter,
limit=DEFAULT_LIMIT,
)
# Backward-compatible fallback for legacy indexed payloads where participant may
# still be stored as full "Display Name <email@domain>".
if p_val and not points:
logger.info("No result with exact participant filter, trying fallback matching...")
fallback_filter = build_filter(participant=None, start_date=sd_val, end_date=ed_val)
fallback_points = vector_search_points(
qdrant,
query_vector=query_vector,
query_filter=fallback_filter,
limit=max(DEFAULT_LIMIT * 5, 50),
)
points = [
p for p in fallback_points
if payload_matches_participant(p.payload or {}, p_val)
][:DEFAULT_LIMIT]
logger.info(f"Found {len(points)} results")
return {
"query": query,
"filters": {
"participant": participant,
"start_date": start_date,
"end_date": end_date,
},
"count": len(points),
"results": [format_search_result(point) for point in points],
}
except Exception as exc:
logger.error(f"Error in search_emails: {exc}", exc_info=True)
return {"error": f"search_emails failed: {exc}"}
@mcp.tool() @mcp.tool()
def read_email(message_id: str): def read_email(message_id: str):
""" """
Returns the full text content (cleaned of HTML) of a specific email. Returns the full text content (cleaned of HTML) of a specific email.
""" """
# TODO: Implement fetching email by message_id logger.info(
return f"Reading email {message_id}..." "Tool read_email input diagnostics: message_id=%r(type=%s,len=%d)",
message_id,
type(message_id).__name__,
len(message_id) if isinstance(message_id, str) else -1,
)
try:
qdrant = get_qdrant_client()
points, _ = qdrant.scroll(
collection_name=COLLECTION_NAME,
scroll_filter=models.Filter(
must=[
models.FieldCondition(
key="message_id",
match=models.MatchValue(value=message_id),
)
]
),
limit=1,
with_payload=True,
with_vectors=False,
)
if not points:
logger.warning(f"No email found for message_id={message_id}")
return {"error": f"No email found for message_id={message_id}"}
payload = points[0].payload or {}
return {
"message_id": payload.get("message_id"),
"date": payload.get("date"),
"sender": payload.get("sender"),
"receiver": payload.get("receiver"),
"subject": payload.get("subject"),
"attachments": payload.get("attachments", []),
"body_text": payload.get("body_text", ""),
}
except Exception as exc:
logger.error(f"Error in read_email: {exc}", exc_info=True)
return {"error": f"read_email failed: {exc}"}
if __name__ == "__main__": if __name__ == "__main__":
logger.info("Initializing models before starting server...")
try:
get_embedding_model()
logger.info("Models loaded successfully.")
except Exception as e:
logger.error(f"Failed to load models: {e}")
logger.info("Starting SSE server on 0.0.0.0:8000...")
# Start the MCP server using SSE (Server-Sent Events) over HTTP # Start the MCP server using SSE (Server-Sent Events) over HTTP
mcp.run(transport="sse", host="0.0.0.0", port=8000) mcp.run(transport="sse", host="0.0.0.0", port=8000)