12 Practice Lab: Build a Semantic Search Engine
12.1 What You’ll Build
A Semantic Search Engine that: - Indexes a folder of documents (PDF, TXT, MD) - Supports natural language queries - Shows similarity scores and source highlights - Includes a web interface (optional)
12.2 Step 1: Document Ingestion Pipeline
# file: semantic_search/ingest.py
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import (
TextLoader, PyPDFLoader, UnstructuredMarkdownLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
def load_document(file_path: Path):
    """Load one document with the loader matching its file extension.

    Returns an empty list for unsupported extensions so callers can
    iterate the result unconditionally.
    """
    loader_for = {
        ".txt": TextLoader,
        ".pdf": PyPDFLoader,
        ".md": UnstructuredMarkdownLoader,
    }
    loader_cls = loader_for.get(file_path.suffix.lower())
    if loader_cls is None:
        return []
    return loader_cls(str(file_path)).load()
def ingest_documents(
    docs_folder: str,
    collection_name: str = "documents",
    db_path: str = "./search_db",
):
    """Index every .txt/.pdf/.md file under *docs_folder* into ChromaDB.

    The target collection is rebuilt from scratch on every run.

    Args:
        docs_folder: Root folder, scanned recursively.
        collection_name: Name of the ChromaDB collection to (re)create.
        db_path: Directory for the persistent ChromaDB store.

    Returns:
        The populated ChromaDB collection.
    """
    chroma_client = chromadb.PersistentClient(path=db_path)
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )

    # Rebuild from scratch: drop any previous version of the collection.
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed.
    try:
        chroma_client.delete_collection(collection_name)
    except Exception:
        pass  # collection did not exist yet — nothing to delete

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=openai_ef,
    )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    all_chunks = []
    all_ids = []
    all_metadatas = []
    supported = {".txt", ".pdf", ".md"}
    for file_path in Path(docs_folder).rglob("*"):
        # Compare lower-cased so e.g. "REPORT.PDF" is indexed too,
        # consistent with load_document().
        if not (file_path.is_file() and file_path.suffix.lower() in supported):
            continue
        docs = load_document(file_path)
        chunks = splitter.split_documents(docs)
        for i, chunk in enumerate(chunks):
            all_chunks.append(chunk.page_content)
            # Use the full path, not just the stem: two files named
            # "notes.txt" and "notes.md" (or same name in different
            # folders) would otherwise produce colliding IDs.
            all_ids.append(f"{file_path}_{i}")
            all_metadatas.append({
                "source": str(file_path),
                "filename": file_path.name,
                "chunk_index": i,
            })
        print(f"✅ Indexed: {file_path.name} ({len(chunks)} chunks)")

    # Insert in batches to keep individual requests small.
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        collection.add(
            documents=all_chunks[i:i + batch_size],
            ids=all_ids[i:i + batch_size],
            metadatas=all_metadatas[i:i + batch_size],
        )

    print(f"\n📦 Total: {collection.count()} chunks indexed")
    return collection
if __name__ == "__main__":
    ingest_documents("./documents")

12.3 Step 2: Search Engine
# file: semantic_search/search.py
from dataclasses import dataclass
from typing import List, Optional

import chromadb
from chromadb.utils import embedding_functions
@dataclass
class SearchResult:
    """A single chunk matched by a semantic query."""
    content: str      # the chunk's text
    source: str       # filename the chunk came from
    score: float      # similarity score (1 - distance; higher is better)
    chunk_index: int  # position of the chunk within its source document
def get_collection(collection_name: str = "documents", db_path: str = "./search_db"):
    """Connect to an existing ChromaDB collection.

    Args:
        collection_name: Name of the collection created at ingest time.
        db_path: Directory of the persistent ChromaDB store (must match
            the path used during ingestion).

    Returns:
        The ChromaDB collection handle.
    """
    client = chromadb.PersistentClient(path=db_path)
    # Must be the same embedding model used at ingest time, otherwise
    # query vectors live in a different space than the stored vectors.
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )
    return client.get_collection(
        name=collection_name,
        embedding_function=openai_ef,
    )
def search(
    query: str,
    n_results: int = 5,
    source_filter: Optional[str] = None,
    collection_name: str = "documents",
) -> List[SearchResult]:
    """Search the document collection with a natural-language query.

    Args:
        query: Natural-language search string.
        n_results: Maximum number of chunks to return.
        source_filter: If given, restrict results to this filename
            (matched against the "filename" metadata stored at ingest).
        collection_name: ChromaDB collection to query.

    Returns:
        SearchResult list, best match first.
    """
    collection = get_collection(collection_name)
    # Metadata filter on the "filename" field written by the ingest step.
    where = {"filename": {"$eq": source_filter}} if source_filter else None
    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=where,
        include=["documents", "distances", "metadatas"],
    )
    search_results = []
    for doc, distance, metadata in zip(
        results["documents"][0],
        results["distances"][0],
        results["metadatas"][0],
    ):
        search_results.append(SearchResult(
            content=doc,
            source=metadata.get("filename", "Unknown"),
            # NOTE(review): 1 - distance is a valid similarity only for
            # cosine distance — confirm the collection's distance metric.
            score=1 - distance,
            chunk_index=metadata.get("chunk_index", 0),
        ))
    return search_results
def interactive_search():
    """Read queries from stdin and print the top matches until 'quit'."""
    print("🔍 Semantic Search Engine")
    print("Type 'quit' to exit\n")
    while True:
        query = input("Search: ").strip()
        if query.lower() == "quit":
            break
        if not query:
            continue

        hits = search(query, n_results=3)
        print(f"\nTop {len(hits)} results for: '{query}'")
        print("=" * 60)
        for rank, hit in enumerate(hits, 1):
            print(f"\n{rank}. [{hit.score:.3f}] {hit.source}")
            print(f" {hit.content[:200]}...")
if __name__ == "__main__":
    interactive_search()

12.4 Step 3: AI-Enhanced Search (RAG Preview)
Add an LLM to synthesise search results into a coherent answer:
from openai import OpenAI
client = OpenAI()
def search_and_answer(question: str) -> str:
    """Search documents and generate an answer."""
    hits = search(question, n_results=4)
    # Label each chunk with its source so the model can cite it.
    labelled = [f"Source: {h.source}\n{h.content}" for h in hits]
    context = "\n\n---\n\n".join(labelled)

    system_prompt = (
        "Answer questions using only the provided context. "
        "Cite sources. If the answer isn't in the context, say so."
    )
    user_prompt = f"Context:\n{context}\n\nQuestion: {question}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )
    return response.choices[0].message.content
# Test it
answer = search_and_answer("What are the main applications of embeddings?")
print(answer)

12.5 Lab Challenges 🏆
- Easy: Add a `--limit-source` flag to restrict search to one file
- Medium: Build a Streamlit UI for the search engine
- Hard: Implement hybrid search (combine semantic + keyword/BM25 scores)