def normalize_ws(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    Args:
        s: Text that may contain newlines, tabs or repeated spaces.

    Returns:
        The text with each whitespace run replaced by a single space and no
        leading or trailing whitespace.
    """
    # ``\s`` (with the backslash) is the whitespace class; a bare ``s+`` would
    # match runs of the letter "s" and corrupt ordinary words.
    return re.sub(r"\s+", " ", s).strip()
# Small in-memory corpus. Every entry carries a "doc_id", a "title" and a
# "text" body.
RAW_DOCS = [
    {
        "doc_id": "trulens_core",
        "title": "TruLens core idea",
        "text": "TruLens is used to track and evaluate LLM applications. It can log app runs, compute feedback scores, and provide a dashboard to compare versions and investigate traces and results.",
    },
    {
        "doc_id": "trulens_feedback",
        "title": "Feedback functions",
        "text": "TruLens feedback functions can score groundedness, context relevance, and answer relevance. They are configured by specifying which parts of an app record should be used as inputs.",
    },
    {
        "doc_id": "trulens_rag",
        "title": "RAG workflow",
        "text": "A typical RAG system retrieves relevant chunks from a vector database and then generates an answer using those chunks as context. The quality depends on retrieval, prompt design, and generation behavior.",
    },
    {
        "doc_id": "trulens_instrumentation",
        "title": "Instrumentation",
        "text": "Instrumentation adds tracing spans to your app functions (like retrieval and generation). This makes it possible to analyze which contexts were retrieved, latency, token usage, and connect feedback evaluations to specific steps.",
    },
    {
        "doc_id": "vectorstores",
        "title": "Vector stores and embeddings",
        "text": "Vector stores index embeddings for text chunks, enabling semantic search. OpenAI embedding models can be used to embed chunks and queries, and Chroma can store them locally in memory for a notebook demo.",
    },
    {
        "doc_id": "prompting",
        "title": "Prompting and citations",
        "text": "Prompting can encourage careful, citation-grounded answers. A stronger prompt can enforce: answer only from context, be explicit about uncertainty, and provide short citations that map to retrieved chunks.",
    },
]
@dataclass
class Chunk:
    """One contiguous piece of a document's text plus identifying metadata.

    Attributes:
        chunk_id: Identifier of this chunk.
        doc_id: Identifier of the document the chunk belongs to.
        title: Title of that document.
        text: The chunk's text content.
        meta: Free-form metadata mapping attached to the chunk.
    """

    chunk_id: str
    doc_id: str
    title: str
    text: str
    meta: Dict[str, Any]
def chunk_docs(docs, chunk_size=350, overlap=80) -> List[Chunk]:
    """Split each document's text into overlapping character windows.

    Each document's ``"text"`` is passed through ``normalize_ws`` and then cut
    into windows of at most ``chunk_size`` characters. Each window starts
    ``overlap`` characters before the end of the previous one. Chunk ids have
    the form ``"<doc_id>_c<index>"``; the index restarts at 0 for every
    document.

    Args:
        docs: Iterable of mappings with ``"doc_id"``, ``"title"`` and
            ``"text"`` keys.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        All chunks in document order, then in position order within each
        document. A document whose normalized text is empty yields no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # The window advances by ``chunk_size - overlap`` characters per step,
        # so this combination would never reach the end of the text.
        raise ValueError("overlap must be smaller than chunk_size")

    chunks: List[Chunk] = []
    for d in docs:
        text = normalize_ws(d["text"])
        start = 0
        idx = 0
        while start < len(text):
            end = min(len(text), start + chunk_size)
            chunks.append(
                Chunk(
                    chunk_id=f'{d["doc_id"]}_c{idx}',
                    doc_id=d["doc_id"],
                    title=d["title"],
                    text=text[start:end],
                    meta={"doc_id": d["doc_id"], "title": d["title"], "chunk_index": idx},
                )
            )
            if end == len(text):
                # The final window reached the end of the text; stepping back
                # by ``overlap`` would only re-emit its tail.
                break
            idx += 1
            start = end - overlap
    return chunks
# Chunk the whole RAW_DOCS corpus using chunk_docs' default chunk_size and overlap.
CHUNKS = chunk_docs(RAW_DOCS)
