Skip to main content

Overview

Once you’ve generated a PageIndex tree structure, you need effective strategies to search and retrieve relevant sections. Unlike traditional vector-based RAG (Retrieval Augmented Generation), PageIndex leverages hierarchical structure for more targeted retrieval.
  • Structural Awareness: Understand document hierarchy and relationships
  • Explainable Results: Clear reasoning for why sections were selected
  • Context Preservation: Retrieve parent/child sections together when needed
  • Expert Knowledge Integration: Easily incorporate domain-specific search preferences
  • No Embedding Drift: Structure is explicit, not dependent on embedding model semantics
Tree search excels when:
  • Documents have clear hierarchical structure
  • Section relationships matter for understanding
  • You need to explain retrieval decisions
  • Domain expertise can guide search priority
The simplest strategy is to use an LLM to analyze the tree structure and select relevant nodes.

Simple Tree Search Prompt

from pageindex import page_index
import json

# Generate tree structure
result = page_index('document.pdf', if_add_node_summary='yes')
tree_structure = result['structure']

# Search query
query = "What are the key findings about customer retention?"

# Tree search prompt
prompt = f"""
You are given a query and the tree structure of a document.
You need to find all nodes that are likely to contain the answer.

Query: {query}

Document tree structure: {json.dumps(tree_structure, indent=2)}

Reply in the following JSON format:
{{
  "thinking": <your reasoning about which nodes are relevant>,
  "node_list": [node_id1, node_id2, ...]
}}
"""

# Call LLM (example with OpenAI).
# NOTE: openai.ChatCompletion was removed in openai>=1.0 — use the client
# API, matching the style used in the other examples in this document.
from openai import OpenAI

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-2024-11-20",
    messages=[{"role": "user", "content": prompt}],
    # Forces a JSON object reply so json.loads below cannot fail on prose.
    response_format={"type": "json_object"}
)

# Distinct name so the page_index() result above is not shadowed.
search_result = json.loads(response.choices[0].message.content)
selected_nodes = search_result['node_list']
print(f"Selected nodes: {selected_nodes}")
print(f"Reasoning: {search_result['thinking']}")

Retrieving Node Content

Once you have selected node IDs, retrieve their content:
def find_node_by_id(structure, node_id):
    """Depth-first search for the node whose 'node_id' equals *node_id*.

    Returns the matching node dict, or None when no node matches.
    """
    for candidate in structure:
        if candidate.get('node_id') == node_id:
            return candidate
        # Descend into children (absent 'nodes' key means a leaf).
        match = find_node_by_id(candidate.get('nodes', []), node_id)
        if match is not None:
            return match
    return None

# Retrieve the content of every node the LLM selected.
selected_content = []
for chosen_id in selected_nodes:
    match = find_node_by_id(tree_structure, chosen_id)
    if match is None:
        continue  # skip IDs not present in the tree
    selected_content.append({
        'node_id': chosen_id,
        'title': match['title'],
        'summary': match.get('summary', ''),
        'text': match.get('text', '')  # if you generated with --if-add-node-text yes
    })

print(json.dumps(selected_content, indent=2))

Advanced Search Strategies

1. Hierarchical Expansion

When a parent node is relevant, include its children:
def get_node_with_children(structure, node_id):
    """Get a node and all its descendants.

    Looks up *node_id* anywhere in *structure* (via find_node_by_id), then
    expands the full subtree rooted at that node.

    Args:
        structure: List of PageIndex tree nodes (each may hold child 'nodes').
        node_id: ID of the subtree root to expand.

    Returns:
        Nested dict with 'node_id', 'title', 'summary' and 'children',
        or None when the ID is not found.
    """
    node = find_node_by_id(structure, node_id)
    if not node:
        return None
    return _expand_subtree(node)


def _expand_subtree(node):
    """Convert one tree node into the nested result shape, recursing on children."""
    # Recurse on each child node directly. The original wrapped every child
    # in a one-element list and re-searched for it by ID — a redundant
    # lookup per child that this helper removes.
    return {
        'node_id': node['node_id'],
        'title': node['title'],
        'summary': node.get('summary', ''),
        'children': [_expand_subtree(child) for child in node.get('nodes', [])]
    }

# Get node with full subtree ('0003' is an example PageIndex node ID).
expanded_node = get_node_with_children(tree_structure, '0003')
2. Multi-Level Granularity Search

Search at different granularity levels:

Step 1: Top-Level Selection

First, identify relevant top-level sections:
# Summarize only the root-level nodes for the first (coarse) search pass.
section_overview = [{"node_id": n["node_id"], "title": n["title"], "summary": n.get("summary")} 
              for n in tree_structure]
top_level_prompt = f"""
Query: {query}

Top-level sections:
{json.dumps(section_overview, indent=2)}

Select the most relevant top-level sections.
"""
Step 2: Subsection Selection

For each selected top-level section, search its subsections:
# Drill into the immediate children of each chosen top-level section.
# NOTE(review): selected_top_level is expected to hold node IDs returned by
# the Level-1 prompt; the LLM call that consumes subsection_prompt is
# omitted from this excerpt — confirm against the full pipeline.
for parent_id in selected_top_level:
    parent_node = find_node_by_id(tree_structure, parent_id)
    # Leaves have no 'nodes' key, so there is nothing to drill into.
    if 'nodes' in parent_node:
        subsection_prompt = f"""
        Query: {query}
        
        Parent section: {parent_node['title']}
        
        Subsections:
        {json.dumps(parent_node['nodes'], indent=2)}
        
        Select relevant subsections.
        """
Step 3: Final Content Retrieval

Retrieve the most specific relevant content.

3. Similarity + Structure Hybrid

Combine embedding similarity with structural search:
import numpy as np
from openai import OpenAI

# Shared OpenAI client used by get_embedding() below.
client = OpenAI()

def get_embedding(text):
    """Embed *text* with OpenAI's text-embedding-3-small model."""
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return result.data[0].embedding

def cosine_similarity(a, b):
    """Cosine similarity between vectors *a* and *b* (1.0 = same direction)."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator

# Embed the query once; every node is compared against this vector.
query_embedding = get_embedding(query)

# Score each top-level node by embedding similarity of its title + summary.
node_scores = []
for entry in tree_structure:
    combined = f"{entry['title']} {entry.get('summary', '')}"
    entry_embedding = get_embedding(combined)
    node_scores.append({
        'node_id': entry['node_id'],
        'title': entry['title'],
        'similarity': cosine_similarity(query_embedding, entry_embedding)
    })

# Keep the five most similar nodes.
top_nodes = sorted(node_scores, key=lambda item: item['similarity'], reverse=True)[:5]

# Let the LLM refine the embedding shortlist using document structure.
refinement_prompt = f"""
Query: {query}

Top similar nodes by embedding:
{json.dumps(top_nodes, indent=2)}

Full tree structure:
{json.dumps(tree_structure, indent=2)}

Consider both similarity scores and document structure to select the most relevant nodes.
Also suggest any related parent/child nodes that should be included for context.
"""

Integrating Expert Knowledge

Unlike vector-based RAG, PageIndex makes it easy to incorporate domain expertise and user preferences.
Step 1: Define Expert Knowledge

Create domain-specific search rules:
# Domain-specific retrieval preferences, keyed by query category.
# Each entry holds trigger keywords, the 10-K sections to prefer, and
# free-text guidance that is injected into the search prompt.
expert_knowledge = {
    "financial_queries": {
        "keywords": ["revenue", "EBITDA", "earnings", "financial"],
        "priority_sections": [
            "Item 7 (Management Discussion & Analysis)",
            "Item 8 (Financial Statements)"
        ],
        "guidance": "For financial metrics, prioritize MD&A and financial statements over other sections."
    },
    "risk_queries": {
        "keywords": ["risk", "uncertainty", "challenge"],
        "priority_sections": [
            "Item 1A (Risk Factors)",
            "Forward-Looking Statements"
        ],
        "guidance": "Risk-related queries should focus on the Risk Factors section."
    }
}
Step 2: Match Query to Knowledge Domain

Identify which expert knowledge applies:
def detect_query_domain(query, knowledge_base):
    """Return the first (domain, info) pair whose keywords appear in *query*.

    Matching is case-insensitive substring containment; returns (None, None)
    when no domain's keywords match.
    """
    lowered = query.lower()
    for name, entry in knowledge_base.items():
        matched = any(keyword in lowered for keyword in entry['keywords'])
        if matched:
            return name, entry
    return None, None

domain, knowledge = detect_query_domain(query, expert_knowledge)
Step 3: Enhanced Search with Preferences

Incorporate knowledge into the search prompt:
# Inject the matched domain guidance into the search prompt; fall back to
# the plain tree-search prompt when no expert knowledge applies.
if knowledge:
    enhanced_prompt = f"""
    You are given a question and a tree structure of a document.
    You need to find all nodes that are likely to contain the answer.
    
    Query: {query}
    
    Document tree structure: {json.dumps(tree_structure, indent=2)}
    
    Expert Knowledge: {knowledge['guidance']}
    
    Priority Sections: {json.dumps(knowledge['priority_sections'])}
    
    Reply in the following JSON format:
    {{
      "thinking": <reasoning about which nodes are relevant, considering expert knowledge>,
      "node_list": [node_id1, node_id2, ...]
    }}
    """
else:
    # Use basic prompt
    # NOTE(review): basic_search_prompt is not defined in this excerpt —
    # presumably the simple tree-search prompt shown earlier; confirm.
    enhanced_prompt = basic_search_prompt
def search_10k_document(query, tree_structure):
    """Build an expert search prompt for a 10-K filing.

    Renders the query, the serialized tree, and a fixed map of 10-K item
    numbers into a single prompt string for the LLM (the caller sends it).
    """
    
    # Fixed mapping of query topics to 10-K item numbers.
    section_guide = """
    For 10-K filings:
    - Financial metrics (revenue, earnings, EBITDA) → Item 7 (MD&A) and Item 8
    - Business overview → Item 1 (Business)
    - Risk factors → Item 1A (Risk Factors)
    - Legal proceedings → Item 3 (Legal Proceedings)
    - Management team → Item 10 (Directors and Officers)
    """
    
    # Return the fully rendered prompt directly.
    return f"""
    You are an expert financial analyst searching a 10-K filing.
    
    Query: {query}
    
    Document tree structure: {json.dumps(tree_structure, indent=2)}
    
    Expert Knowledge:
    {section_guide}
    
    Select the most relevant sections based on both the query and your knowledge
    of 10-K document structure. Explain your reasoning.
    
    Reply in JSON format:
    {{
      "thinking": "<reasoning>",
      "node_list": ["node_id1", "node_id2", ...]
    }}
    """

# Example usage
# Build the tree for a 10-K filing, then produce the expert search prompt;
# send search_prompt to your LLM to obtain the selected node IDs.
query = "What were the EBITDA adjustments in Q4?"
tree = page_index('10k_filing.pdf', if_add_node_summary='yes')['structure']
search_prompt = search_10k_document(query, tree)

Advanced: Monte Carlo Tree Search (MCTS)

PageIndex’s dashboard and retrieval API use a combination of LLM tree search and value function-based Monte Carlo Tree Search (MCTS). Full implementation details will be released soon.

MCTS Overview

MCTS is a search algorithm that:
  1. Selection: Navigate the tree using a value function
  2. Expansion: Expand promising nodes
  3. Simulation: Evaluate node relevance
  4. Backpropagation: Update node values

Simplified MCTS Example

import math
from collections import defaultdict

class TreeSearchMCTS:
    """Simplified MCTS over a PageIndex tree.

    Repeatedly picks a top-level node via UCB1, scores it with an LLM
    relevance evaluation (placeholder here), and folds the score back into
    the node statistics; search() returns the best-scoring nodes.

    Fix over the original example: search() called select_node(),
    backpropagate() and get_best_nodes(), none of which were defined, so
    any call raised AttributeError. They are implemented below.
    """

    def __init__(self, tree_structure, query, model="gpt-4o-2024-11-20"):
        """Store the tree, query and model name; zero out MCTS statistics."""
        self.tree = tree_structure
        self.query = query
        self.model = model
        # Per-node visit counts and cumulative relevance scores, by node_id.
        self.visits = defaultdict(int)
        self.scores = defaultdict(float)
    
    def ucb1_score(self, node_id, parent_visits):
        """Upper Confidence Bound formula for balancing exploration/exploitation."""
        # Unvisited nodes score infinity so they are always explored first.
        if self.visits[node_id] == 0:
            return float('inf')
        
        exploitation = self.scores[node_id] / self.visits[node_id]
        exploration = math.sqrt(2 * math.log(parent_visits) / self.visits[node_id])
        return exploitation + exploration
    
    def select_node(self):
        """Pick the top-level node with the highest UCB1 score."""
        # +1 keeps math.log() defined on the very first iteration.
        total_visits = sum(self.visits[n['node_id']] for n in self.tree) + 1
        return max(self.tree, key=lambda n: self.ucb1_score(n['node_id'], total_visits))
    
    def evaluate_node(self, node):
        """Evaluate node relevance using LLM."""
        prompt = f"""
        Query: {self.query}
        
        Node:
        - Title: {node['title']}
        - Summary: {node.get('summary', '')}
        
        Rate the relevance of this node to the query on a scale of 0-1.
        Reply with just a number.
        """
        
        # Call LLM and parse score
        # score = call_llm(prompt)
        score = 0.8  # Placeholder
        return score
    
    def backpropagate(self, node_id, score):
        """Fold one evaluation result back into the node's statistics."""
        self.visits[node_id] += 1
        self.scores[node_id] += score
    
    def get_best_nodes(self, k=5):
        """Return up to *k* visited nodes ranked by mean relevance score."""
        visited = [n for n in self.tree if self.visits[n['node_id']] > 0]
        visited.sort(
            key=lambda n: self.scores[n['node_id']] / self.visits[n['node_id']],
            reverse=True,
        )
        return visited[:k]
    
    def search(self, iterations=100):
        """Perform MCTS iterations and return the top nodes."""
        for _ in range(iterations):
            # Selection phase
            current_node = self.select_node()
            
            # Evaluation phase
            score = self.evaluate_node(current_node)
            
            # Backpropagation phase
            self.backpropagate(current_node['node_id'], score)
        
        # Return top nodes
        return self.get_best_nodes(k=5)

Retrieval Best Practices

Summary-First Approach: Use if_add_node_summary=yes for search, then retrieve full text only for selected nodes.
Context Window Management: When retrieving multiple nodes, consider:
  • Parent node context (prefix_summary)
  • Sibling nodes for completeness
  • Total token count for LLM context window
Avoid Over-Retrieval: Don’t retrieve entire subtrees unless necessary. Be selective to stay within context limits.
Cache Embeddings: If using hybrid search, cache node embeddings to avoid recalculating for every query.

Evaluation Metrics

Evaluate your search strategy:

Precision and Recall

def evaluate_retrieval(retrieved_nodes, ground_truth_nodes):
    """Compute precision, recall and F1 for a retrieval result.

    Both arguments are collections of node IDs; duplicates are ignored.
    Empty inputs yield 0 for the affected metric instead of dividing by zero.
    """
    retrieved = set(retrieved_nodes)
    expected = set(ground_truth_nodes)
    
    hits = len(retrieved & expected)
    
    precision = hits / len(retrieved) if retrieved else 0
    recall = hits / len(expected) if expected else 0
    
    denom = precision + recall
    f1 = (2 * precision * recall / denom) if denom > 0 else 0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

Mean Reciprocal Rank (MRR)

def calculate_mrr(retrieved_nodes, relevant_nodes):
    """Reciprocal rank of the first relevant hit (0.0 when none is found)."""
    rank = 0
    for node_id in retrieved_nodes:
        rank += 1
        if node_id in relevant_nodes:
            return 1.0 / rank
    return 0.0
import json
from openai import OpenAI
from pageindex import page_index

# Shared client for the search and answer-generation calls in search_document().
client = OpenAI()

def search_document(pdf_path, query):
    """Complete end-to-end search pipeline.

    Builds the PageIndex tree (with summaries and node IDs), asks the LLM
    to pick relevant nodes, fetches those nodes from the tree, and answers
    the query from the retrieved summaries.

    Args:
        pdf_path: Path to the PDF handed to page_index().
        query: Natural-language question to answer.

    Returns:
        Dict with 'query', 'reasoning', 'selected_nodes' and 'answer'.

    NOTE(review): relies on find_node_by_id() defined earlier in this
    document and on network calls to OpenAI and page_index().
    """
    
    # Step 1: Generate tree structure with summaries
    print("Generating tree structure...")
    result = page_index(
        pdf_path,
        if_add_node_summary='yes',
        if_add_node_id='yes'
    )
    tree = result['structure']
    
    # Step 2: Search with LLM
    print("Searching tree structure...")
    search_prompt = f"""
    Query: {query}
    
    Document tree: {json.dumps(tree, indent=2)}
    
    Find the most relevant nodes. Reply in JSON:
    {{
      "thinking": "<reasoning>",
      "node_list": ["id1", "id2", ...]
    }}
    """
    
    # response_format forces a JSON object so json.loads below is safe.
    response = client.chat.completions.create(
        model="gpt-4o-2024-11-20",
        messages=[{"role": "user", "content": search_prompt}],
        response_format={"type": "json_object"}
    )
    
    search_result = json.loads(response.choices[0].message.content)
    
    # Step 3: Retrieve node content
    print("Retrieving content...")
    retrieved_content = []
    for node_id in search_result['node_list']:
        node = find_node_by_id(tree, node_id)
        if node:  # skip IDs the LLM invented that are not in the tree
            retrieved_content.append(node)
    
    # Step 4: Generate answer
    print("Generating answer...")
    # Context is built from summaries only; swap in node['text'] if the
    # tree was generated with full node text.
    context = "\n\n".join([
        f"Section: {node['title']}\n{node.get('summary', '')}"
        for node in retrieved_content
    ])
    
    answer_prompt = f"""
    Query: {query}
    
    Relevant context:
    {context}
    
    Answer the query based on the context provided.
    """
    
    answer_response = client.chat.completions.create(
        model="gpt-4o-2024-11-20",
        messages=[{"role": "user", "content": answer_prompt}]
    )
    
    return {
        'query': query,
        'reasoning': search_result['thinking'],
        'selected_nodes': search_result['node_list'],
        'answer': answer_response.choices[0].message.content
    }

# Usage: end-to-end run — builds the tree, searches it, and answers the
# query (requires network access to OpenAI and the pageindex package).
result = search_document('annual_report.pdf', 'What were the main revenue drivers in 2024?')
print(json.dumps(result, indent=2))

Next Steps

Build docs developers (and LLMs) love