# 12. Practice Lab: Build a Semantic Search Engine

> 🧪 **Important — Lab Overview**

**Duration:** 2–3 hours | **Difficulty:** ⭐⭐⭐☆☆ (Intermediate)
**Goal:** Build a production-ready semantic search engine over a document corpus.

## 12.1 What You’ll Build

A **Semantic Search Engine** that:

- Indexes a folder of documents (PDF, TXT, MD)
- Supports natural language queries
- Shows similarity scores and source highlights
- Includes a web interface (optional)


## 12.2 Step 1: Document Ingestion Pipeline

# file: semantic_search/ingest.py
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import (
    TextLoader, PyPDFLoader, UnstructuredMarkdownLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_document(file_path: Path):
    """Load a single document, dispatching on its file extension.

    Returns the loaded LangChain document(s), or an empty list when the
    extension is not one of the supported types (.txt, .pdf, .md).
    """
    suffix = file_path.suffix.lower()
    if suffix == ".txt":
        loader_cls = TextLoader
    elif suffix == ".pdf":
        loader_cls = PyPDFLoader
    elif suffix == ".md":
        loader_cls = UnstructuredMarkdownLoader
    else:
        # Unsupported extension: nothing to index.
        return []
    return loader_cls(str(file_path)).load()

def ingest_documents(docs_folder: str, collection_name: str = "documents"):
    """Ingest all .txt/.pdf/.md documents from a folder into ChromaDB.

    Recursively scans *docs_folder*, splits each document into overlapping
    chunks, and stores them (with source metadata) in a persistent ChromaDB
    collection embedded with OpenAI's text-embedding-3-small model.

    Args:
        docs_folder: Folder to scan recursively for documents.
        collection_name: Name of the collection to (re)create.

    Returns:
        The populated ChromaDB collection.
    """

    # Setup: persistent local store + OpenAI embedding function.
    # NOTE(review): OpenAIEmbeddingFunction presumably reads OPENAI_API_KEY
    # from the environment when api_key= is not passed — confirm it is set.
    chroma_client = chromadb.PersistentClient(path="./search_db")
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )

    # Drop any stale collection so re-runs start fresh. Catch only
    # Exception (was a bare `except:`, which also swallows KeyboardInterrupt
    # and SystemExit); a missing collection is the expected failure here.
    try:
        chroma_client.delete_collection(collection_name)
    except Exception:
        pass

    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=openai_ef
    )

    # Chunking: ~800 chars with 100-char overlap, splitting on the most
    # natural boundary available (paragraph > line > sentence > word).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    all_chunks = []
    all_ids = []
    all_metadatas = []

    for file_path in Path(docs_folder).rglob("*"):
        # .lower() keeps this consistent with load_document (handles .PDF etc.).
        if file_path.is_file() and file_path.suffix.lower() in (".txt", ".pdf", ".md"):
            docs = load_document(file_path)
            chunks = splitter.split_documents(docs)

            # Build IDs from the full relative path, not just the stem:
            # two files with the same stem in different subfolders must
            # not produce colliding chunk IDs.
            rel_path = file_path.relative_to(docs_folder).as_posix()

            for i, chunk in enumerate(chunks):
                all_chunks.append(chunk.page_content)
                all_ids.append(f"{rel_path}_{i}")
                all_metadatas.append({
                    "source": str(file_path),
                    "filename": file_path.name,
                    "chunk_index": i
                })

            print(f"✅ Indexed: {file_path.name} ({len(chunks)} chunks)")

    # Batch insert to keep individual requests reasonably sized.
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        collection.add(
            documents=all_chunks[i:i+batch_size],
            ids=all_ids[i:i+batch_size],
            metadatas=all_metadatas[i:i+batch_size]
        )

    print(f"\n📦 Total: {collection.count()} chunks indexed")
    return collection

if __name__ == "__main__":
    # Index everything under ./documents when run as a script.
    ingest_documents("./documents")

## 12.3 Step 2: Search Engine

# file: semantic_search/search.py
from dataclasses import dataclass
from typing import List, Optional

import chromadb
from chromadb.utils import embedding_functions

@dataclass
class SearchResult:
    """One search hit: a chunk of text plus its provenance and score."""
    content: str      # text of the matched chunk
    source: str       # filename the chunk came from
    score: float      # similarity (1 - distance); higher is better
    chunk_index: int  # position of the chunk within its source document

def get_collection(collection_name: str = "documents"):
    """Open an existing ChromaDB collection from the local persistent store.

    The embedding function must match the one used at ingest time
    (OpenAI text-embedding-3-small).
    """
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small"
    )
    store = chromadb.PersistentClient(path="./search_db")
    return store.get_collection(
        name=collection_name,
        embedding_function=embedder
    )

def search(
    query: str,
    n_results: int = 5,
    source_filter: Optional[str] = None,
    collection_name: str = "documents"
) -> List[SearchResult]:
    """Semantically search the indexed document collection.

    Args:
        query: Natural-language search query.
        n_results: Maximum number of hits to return.
        source_filter: If given, restrict results to this exact filename.
        collection_name: Collection to search (default matches ingestion).

    Returns:
        Hits ordered best-first.
    """

    # Was hard-coded to the default collection; now forwards the name so
    # multiple collections can coexist (backward-compatible default).
    collection = get_collection(collection_name)

    # Optional metadata filter: exact match on the stored filename.
    where = {"filename": {"$eq": source_filter}} if source_filter else None

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=where,
        include=["documents", "distances", "metadatas"]
    )

    search_results = []
    for doc, distance, metadata in zip(
        results["documents"][0],
        results["distances"][0],
        results["metadatas"][0]
    ):
        search_results.append(SearchResult(
            content=doc,
            source=metadata.get("filename", "Unknown"),
            # NOTE(review): 1 - distance is a true cosine similarity only if
            # the collection was created with hnsw:space="cosine"; confirm the
            # space, otherwise treat score purely as a ranking key.
            score=1 - distance,
            chunk_index=metadata.get("chunk_index", 0)
        ))

    return search_results

def interactive_search():
    """Run a read-query-print loop on stdin until the user types 'quit'."""
    print("🔍 Semantic Search Engine")
    print("Type 'quit' to exit\n")

    while True:
        user_query = input("Search: ").strip()
        if user_query.lower() == "quit":
            return
        if not user_query:
            # Ignore blank input and prompt again.
            continue

        hits = search(user_query, n_results=3)

        print(f"\nTop {len(hits)} results for: '{user_query}'")
        print("=" * 60)
        for rank, hit in enumerate(hits, 1):
            print(f"\n{rank}. [{hit.score:.3f}] {hit.source}")
            print(f"   {hit.content[:200]}...")

if __name__ == "__main__":
    # Start the CLI search loop when run as a script.
    interactive_search()

## 12.4 Step 3: AI-Enhanced Search (RAG Preview)

Add an LLM to synthesise search results into a coherent answer:

from openai import OpenAI

# Shared OpenAI client used by search_and_answer below.
client = OpenAI()

def search_and_answer(question: str) -> str:
    """Answer *question* using retrieved document chunks as context.

    Retrieves the top 4 chunks for the question, then asks the model to
    answer strictly from that context, citing sources.
    """
    hits = search(question, n_results=4)

    # Label each chunk with its source file and separate chunks clearly
    # so the model can tell them apart.
    sections = [f"Source: {hit.source}\n{hit.content}" for hit in hits]
    context = "\n\n---\n\n".join(sections)

    system_prompt = (
        "Answer questions using only the provided context. "
        "Cite sources. If the answer isn't in the context, say so."
    )
    user_prompt = f"Context:\n{context}\n\nQuestion: {question}"

    # temperature=0 for deterministic, grounded answers.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0
    )

    return completion.choices[0].message.content

# Smoke test — requires an indexed ./search_db and OpenAI API access.
answer = search_and_answer("What are the main applications of embeddings?")
print(answer)

## 12.5 Lab Challenges 🏆

  1. Easy: Add a --limit-source flag to restrict search to one file
  2. Medium: Build a Streamlit UI for the search engine
  3. Hard: Implement hybrid search (combine semantic + keyword/BM25 scores)

> ✅ **Note — Lab Complete!**

Excellent work! You’ve built a real semantic search engine. Up next: RAG — the technique that powers enterprise AI assistants.