# 12. Practice Lab: Build a Semantic Search Engine

> 🧪 **Important — Lab Overview**

**Duration:** 2–3 hours | **Difficulty:** ⭐⭐⭐☆☆ (Intermediate)
**Goal:** Build a production-ready semantic search engine over a document corpus.

## 12.1 What You’ll Build

A **Semantic Search Engine** that:

- Indexes a folder of documents (PDF, TXT, MD)
- Supports natural language queries
- Shows similarity scores and source highlights
- Includes a web interface (optional)


## 12.2 Step 1: Document Ingestion Pipeline

# file: semantic_search/ingest.py
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import (
    TextLoader, PyPDFLoader, UnstructuredMarkdownLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_document(file_path: Path):
    """Load a single document, dispatching on its file extension.

    Returns the loaded LangChain document(s), or an empty list when the
    extension is not one of the supported types (.txt, .pdf, .md).
    """
    suffix = file_path.suffix.lower()
    if suffix == ".txt":
        loader_cls = TextLoader
    elif suffix == ".pdf":
        loader_cls = PyPDFLoader
    elif suffix == ".md":
        loader_cls = UnstructuredMarkdownLoader
    else:
        # Unsupported extension: nothing to index.
        return []
    return loader_cls(str(file_path)).load()

def ingest_documents(docs_folder: str, collection_name: str = "documents"):
    """Ingest all .txt/.pdf/.md documents from a folder into ChromaDB.

    Recursively scans *docs_folder*, splits each document into overlapping
    chunks, and stores them (with source metadata) in a persistent ChromaDB
    collection embedded with OpenAI's text-embedding-3-small model.

    Args:
        docs_folder: Folder to scan recursively for documents.
        collection_name: Name of the collection to (re)create.

    Returns:
        The populated ChromaDB collection.
    """

    # Setup: persistent local store + OpenAI embedding function.
    # NOTE(review): OpenAIEmbeddingFunction presumably reads OPENAI_API_KEY
    # from the environment when api_key= is not passed — confirm it is set.
    chroma_client = chromadb.PersistentClient(path="./search_db")
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )

    # Drop any stale collection so re-runs start fresh. Catch only
    # Exception (was a bare `except:`, which also swallows KeyboardInterrupt
    # and SystemExit); a missing collection is the expected failure here.
    try:
        chroma_client.delete_collection(collection_name)
    except Exception:
        pass

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=openai_ef
    )

    # Chunking: ~800 chars with 100-char overlap, splitting on the most
    # natural boundary available (paragraph > line > sentence > word).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    all_chunks = []
    all_ids = []
    all_metadatas = []

    for file_path in Path(docs_folder).rglob("*"):
        # .lower() keeps this consistent with load_document (handles .PDF etc.).
        if file_path.is_file() and file_path.suffix.lower() in (".txt", ".pdf", ".md"):
            docs = load_document(file_path)
            chunks = splitter.split_documents(docs)

            # Build IDs from the full relative path, not just the stem:
            # two files with the same stem in different subfolders must
            # not produce colliding chunk IDs.
            rel_path = file_path.relative_to(docs_folder).as_posix()

            for i, chunk in enumerate(chunks):
                all_chunks.append(chunk.page_content)
                all_ids.append(f"{rel_path}_{i}")
                all_metadatas.append({
                    "source": str(file_path),
                    "filename": file_path.name,
                    "chunk_index": i
                })

            print(f"✅ Indexed: {file_path.name} ({len(chunks)} chunks)")

    # Batch insert to keep individual requests reasonably sized.
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        collection.add(
            documents=all_chunks[i:i+batch_size],
            ids=all_ids[i:i+batch_size],
            metadatas=all_metadatas[i:i+batch_size]
        )

    print(f"\n📦 Total: {collection.count()} chunks indexed")
    return collection

if __name__ == "__main__":
    # Index everything under ./documents when run as a script.
    ingest_documents("./documents")

## 12.3 Step 2: Search Engine

# file: semantic_search/search.py
from dataclasses import dataclass
from typing import List, Optional

import chromadb
from chromadb.utils import embedding_functions

@dataclass
class SearchResult:
    """One search hit: a chunk of text plus its provenance and score."""
    content: str      # text of the matched chunk
    source: str       # filename the chunk came from
    score: float      # similarity (1 - distance); higher is better
    chunk_index: int  # position of the chunk within its source document

def get_collection(collection_name: str = "documents"):
    """Open an existing ChromaDB collection from the local persistent store.

    The embedding function must match the one used at ingest time
    (OpenAI text-embedding-3-small).
    """
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )
    store = chromadb.PersistentClient(path="./search_db")
    return store.get_collection(
        name=collection_name,
        embedding_function=embedder
    )

def search(
    query: str,
    n_results: int = 5,
    source_filter: Optional[str] = None,
    collection_name: str = "documents"
) -> List[SearchResult]:
    """Semantically search the indexed document collection.

    Args:
        query: Natural-language search query.
        n_results: Maximum number of hits to return.
        source_filter: If given, restrict results to this exact filename.
        collection_name: Collection to search (default matches ingestion).

    Returns:
        Hits ordered best-first.
    """

    # Was hard-coded to the default collection; now forwards the name so
    # multiple collections can coexist (backward-compatible default).
    collection = get_collection(collection_name)

    # Optional metadata filter: exact match on the stored filename.
    where = {"filename": {"$eq": source_filter}} if source_filter else None

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=where,
        include=["documents", "distances", "metadatas"]
    )

    search_results = []
    for doc, distance, metadata in zip(
        results["documents"][0],
        results["distances"][0],
        results["metadatas"][0]
    ):
        search_results.append(SearchResult(
            content=doc,
            source=metadata.get("filename", "Unknown"),
            # NOTE(review): 1 - distance is a true cosine similarity only if
            # the collection was created with hnsw:space="cosine"; confirm the
            # space, otherwise treat score purely as a ranking key.
            score=1 - distance,
            chunk_index=metadata.get("chunk_index", 0)
        ))

    return search_results

def interactive_search():
    """Run a read-query-print loop on stdin until the user types 'quit'."""
    print("🔍 Semantic Search Engine")
    print("Type 'quit' to exit\n")

    while True:
        user_query = input("Search: ").strip()
        if user_query.lower() == "quit":
            return
        if not user_query:
            # Ignore blank input and prompt again.
            continue

        hits = search(user_query, n_results=3)

        print(f"\nTop {len(hits)} results for: '{user_query}'")
        print("=" * 60)
        for rank, hit in enumerate(hits, 1):
            print(f"\n{rank}. [{hit.score:.3f}] {hit.source}")
            print(f"   {hit.content[:200]}...")

if __name__ == "__main__":
    # Start the CLI search loop when run as a script.
    interactive_search()

## 12.4 Step 3: AI-Enhanced Search (RAG Preview)

Add an LLM to synthesise search results into a coherent answer:

from openai import OpenAI

# Shared OpenAI client used by search_and_answer below.
client = OpenAI()

def search_and_answer(question: str) -> str:
    """Answer *question* using retrieved document chunks as context.

    Retrieves the top 4 chunks for the question, then asks the model to
    answer strictly from that context, citing sources.
    """
    hits = search(question, n_results=4)

    # Label each chunk with its source file and separate chunks clearly
    # so the model can tell them apart.
    sections = [f"Source: {hit.source}\n{hit.content}" for hit in hits]
    context = "\n\n---\n\n".join(sections)

    system_prompt = (
        "Answer questions using only the provided context. "
        "Cite sources. If the answer isn't in the context, say so."
    )
    user_prompt = f"Context:\n{context}\n\nQuestion: {question}"

    # temperature=0 for deterministic, grounded answers.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )

    return completion.choices[0].message.content

# Smoke test — requires an indexed ./search_db and OpenAI API access.
answer = search_and_answer("What are the main applications of embeddings?")
print(answer)

## 12.5 Lab Challenges 🏆

  1. Easy: Add a --limit-source flag to restrict search to one file
  2. Medium: Build a Streamlit UI for the search engine
  3. Hard: Implement hybrid search (combine semantic + keyword/BM25 scores)

> ✅ **Note — Lab Complete!**

Excellent work! You’ve built a real semantic search engine. Up next: RAG — the technique that powers enterprise AI assistants.