Skip to main content

Document Routing Algorithm

The document routing system (detectar_documentos) uses a sophisticated three-level scoring algorithm to select the most relevant documents for a query. This is critical because sending irrelevant documents to the LLM wastes context window capacity and degrades response quality.

Algorithm Overview

Implementation

siaa_proxy.py
def detectar_documentos(pregunta: str, max_docs: int = MAX_DOCS_CONTEXTO) -> list:
    """Route *pregunta* to the most relevant loaded documents.

    Scores each document with three weighted signals — TF-IDF keyword
    matches (auto-generated + manual), term-density with approximate IDF,
    and filename-token overlap — and returns up to *max_docs* document
    names ordered by combined score.  When no signal fires, falls back to
    simple vocabulary overlap (at least 2 shared words).
    """
    p = pregunta.lower()
    
    # Tokenize query with alphanumeric tokenizer
    palabras_pregunta = set(tokenizar(p))
    # Raw 3+ char tokens WITHOUT stopword filtering — used only for filename matching
    palabras_3plus = set(re.findall(r'\b[a-záéíóúüñ0-9]{3,}\b', p))
    palabras_filtradas = [w for w in palabras_pregunta if w not in STOPWORDS_ES]
    
    # Get thread-safe snapshots (shallow copies so scoring can iterate lock-free)
    with colecciones_lock:
        snap_cols = dict(colecciones)
    with documentos_lock:
        snap_docs = dict(documentos_cargados)
    with indice_densidad_lock:
        snap_idx = dict(indice_densidad)
    
    N = len(snap_docs) or 1  # guard: keeps the IDF formula defined on an empty corpus
    
    # === LEVEL 1: TF-IDF SCORE ===
    scores_tfidf = defaultdict(float)
    
    # Auto-generated keywords from TF-IDF analysis (substring match against the query)
    for col in snap_cols.values():
        for nombre_doc, keywords in col.get("keywords", {}).items():
            if nombre_doc not in snap_docs:
                continue
            for kw in keywords:
                if kw in p:
                    scores_tfidf[nombre_doc] += 1.0
    
    # Manual keywords (2x weight - more specific)
    for nombre_doc, kws_manuales in KEYWORDS_MANUALES.items():
        if nombre_doc in snap_docs:
            for kw in kws_manuales:
                if kw in p:
                    scores_tfidf[nombre_doc] += 2.0
    
    # === LEVEL 2: DENSITY SCORE ===
    scores_densidad = defaultdict(float)
    for termino in palabras_filtradas:
        if termino not in snap_idx:
            continue
        df_t = len(snap_idx[termino])  # document frequency of the term
        idf_aprox = math.log((N + 1) / (df_t + 1)) + 1  # smoothed IDF, always > 0
        # Only top 5 docs per term to prevent noise
        for densidad, nombre_doc in snap_idx[termino][:5]:
            scores_densidad[nombre_doc] += densidad * idf_aprox
    
    # === LEVEL 3: FILENAME SCORE ===
    scores_nombre = defaultdict(float)
    for nombre_doc, doc in snap_docs.items():
        tokens_nombre = doc.get("tokens_nombre", set())
        coincidencias = tokens_nombre & palabras_3plus
        if coincidencias:
            # Overlap ratio: |matches| / |filename tokens|.  NOTE: not true
            # Jaccard — the denominator is the filename token count, not the union.
            scores_nombre[nombre_doc] = len(coincidencias) / (len(tokens_nombre) or 1)
    
    # === WEIGHTED COMBINATION ===
    scores_combinados = defaultdict(float)
    for doc, s in scores_tfidf.items():
        scores_combinados[doc] += s * 2.0      # TF-IDF weight: 2.0
    for doc, s in scores_densidad.items():
        scores_combinados[doc] += s * 1.0      # Density weight: 1.0
    for doc, s in scores_nombre.items():
        scores_combinados[doc] += s * 1.5      # Filename weight: 1.5
    
    if scores_combinados:
        ordenados = sorted(scores_combinados.keys(),
                          key=lambda d: scores_combinados[d], reverse=True)
        resultado = ordenados[:max_docs]
        log_scores = [(d, round(scores_combinados[d], 4)) for d in resultado]
        print(f"  [ENRUTADOR] max={max_docs} {log_scores}", flush=True)
        return resultado
    
    # === FALLBACK: VOCABULARY OVERLAP ===
    scored = []
    for nombre, doc in snap_docs.items():
        c = len(palabras_pregunta & doc["palabras"])
        if c >= 2:  # Minimum 2 shared words
            scored.append((c, nombre))
    scored.sort(reverse=True)
    return [n for _, n in scored[:max_docs]]

Level 1: TF-IDF Keyword Matching

Auto-generated Keywords

At startup, SIAA computes the top 20 TF-IDF keywords for each document:
siaa_proxy.py
TOP_KEYWORDS_POR_DOC = 20
MIN_FREQ_KEYWORD = 2
MIN_LEN_KEYWORD = 3

def calcular_tfidf_coleccion(documentos: dict) -> dict:
    """Compute the top TF-IDF keywords for every document in *documentos*.

    Returns a mapping doc_name -> list of up to TOP_KEYWORDS_POR_DOC terms
    ranked by tf*idf, skipping terms that occur fewer than
    MIN_FREQ_KEYWORD times or are shorter than MIN_LEN_KEYWORD chars.
    """
    por_doc = {nombre: tokenizar(d["contenido"]) for nombre, d in documentos.items()}
    total_docs = len(documentos)

    # Document frequency: in how many documents each term appears at least once.
    frecuencia_docs = defaultdict(int)
    for lista in por_doc.values():
        for termino in set(lista):
            frecuencia_docs[termino] += 1

    resultado = {}
    for nombre, lista in por_doc.items():
        if not lista:
            resultado[nombre] = []
            continue

        conteo = Counter(lista)
        n_tokens = len(lista)

        # tf * idf (smoothed) for every eligible term.
        puntajes = {
            termino: (freq / n_tokens)
                     * (math.log((total_docs + 1) / (frecuencia_docs[termino] + 1)) + 1)
            for termino, freq in conteo.items()
            if freq >= MIN_FREQ_KEYWORD and len(termino) >= MIN_LEN_KEYWORD
        }

        # Keep only the highest-scoring terms.
        mejores = sorted(puntajes, key=puntajes.get, reverse=True)
        resultado[nombre] = mejores[:TOP_KEYWORDS_POR_DOC]

    return resultado

Manual Keyword Supplementation

Certain domain-specific terms don’t rank high in automatic TF-IDF but are critical for routing:
siaa_proxy.py
KEYWORDS_MANUALES = {
    "acuerdo_pcsja19-11207.md": [
        "capacitacion", "capacita", "capacitar", "quien capacita",
        "cendoj", "udae", "unidad de desarrollo", "analisis estadistico",
        "presentacion de informes", "primer informe",
    ],
    "acuerdo_no._psaa16-10476.md": [
        # Definition queries
        "que es sierju", "que es el sierju", "para que sirve", "objeto",
        "sistema de informacion", "de que trata", "proposito", "finalidad",
        # Periodicity and deadlines
        "sierju", "periodicidad", "reportar", "quinto dia habil",
        # Roles (article 7)
        "roles", "super administrador", "funcionario", "juez", "magistrado",
        # Responsibilities (article 5)
        "responsable", "quien carga", "cargar informacion",
        # Sanctions (articles 19-20)
        "sancion", "incumplimiento", "no reporto", "consecuencia",
        "disciplinario", "no reportar", "castigo",
    ],
}
Manual keywords receive 2x weight to ensure queries about sanctions, roles, or specific forms route to the correct document even if those terms appear infrequently.

Level 2: Term Density Scoring

The density index tracks relative frequency of each term within each document:
siaa_proxy.py
# Build inverted index: term → [(density, doc_name), ...]
nuevo_indice = defaultdict(list)
for nombre_doc, doc in todos_los_docs.items():
    n_total = doc["total_tokens"]
    if n_total == 0:
        continue  # empty document: nothing to index
    for termino, veces in doc["token_count"].items():
        if len(termino) >= MIN_LEN_KEYWORD:
            # Relative frequency (density) so long documents don't dominate.
            nuevo_indice[termino].append((veces / n_total, nombre_doc))

# Sort each posting list by density (highest first)
for termino in nuevo_indice:
    nuevo_indice[termino].sort(reverse=True)

Why Density Matters

A document with 100 occurrences of “sanción” in 5,000 tokens has density = 0.02 (2%). This document is likely MORE relevant for sanction queries than a document with 50 occurrences in 50,000 tokens (density = 0.001).
Density scoring prevents long documents from dominating purely due to size. A 200KB document isn’t automatically ranked higher than a focused 20KB document.

Level 3: Filename Matching

Filenames often contain critical identifiers:
siaa_proxy.py
def _tokens_nombre_archivo(nombre_clave: str) -> set:
    """Extract tokens from filename for matching."""
    sin_ext = os.path.splitext(nombre_clave)[0]
    partes = re.split(r'[_\s\-\.]+', sin_ext.lower())
    # Include alphanumeric parts (psaa16, 10476)
    return {p for p in partes if len(p) >= 3}

# Example: "acuerdo_no._psaa16-10476.md" → {"acuerdo", "psaa16", "10476"}
If the user asks “¿Qué dice el PSAA16?”, filename matching ensures the document acuerdo_no._psaa16-10476.md receives a strong boost even if “psaa16” appears in other documents.

Score Combination Formula

score_final = (score_tfidf × 2.0) + (score_densidad × 1.0) + (score_nombre × 1.5)

Weight Rationale

ComponentWeightReasoning
TF-IDF2.0Most reliable signal; captures document topic
Density1.0Baseline weight; prevents size bias
Filename1.5Strong signal when user mentions specific document codes

Real Examples with Scores

Example 1: “¿Cuándo debo reportar en SIERJU?”

Tokenized: ["cuando", "debo", "reportar", "sierju"] Routing output:
[ENRUTADOR] max=2 [
    ('acuerdo_no._psaa16-10476.md', 8.4521),
    ('acuerdo_pcsja19-11207.md', 2.1053)
]
Score breakdown for acuerdo_no._psaa16-10476.md:
  • TF-IDF: “sierju” matches keyword → +1.0, “reportar” matches manual keyword → +2.0
  • Density: “reportar” density=0.008, IDF=2.1 → +0.017
  • Filename: “psaa16” not in query → +0.0
  • Combined (partial): (3.0 × 2.0) + (0.017 × 1.0) + 0 = 6.017
The logged total (8.4521) is higher because additional keyword matches contribute — for example, “sierju” is also listed in KEYWORDS_MANUALES, adding another +2.0 to the TF-IDF score. The manual keyword boost for “reportar” ensures the PSAA16 document (which defines reporting deadlines) ranks first.

Example 2: “¿Qué sanciones hay por no reportar?”

Tokenized: ["sanciones", "reportar"] Routing output:
[ENRUTADOR] max=2 [
    ('acuerdo_no._psaa16-10476.md', 12.8934),
    ('manual_procedimientos.md', 1.3421)
]
Score breakdown:
  • TF-IDF: “sanciones” → +1.0, “reportar” (manual) → +2.0
  • Density: “sanción” appears 18 times in 1,542 tokens (density=0.0117), IDF=2.8 → +0.033
  • Filename: no match → +0.0
  • Combined: (3.0 × 2.0) + (0.033 × 1.0) = 6.033
Actual score is higher due to multiple related terms (“incumplimiento”, “disciplinario”).

Example 3: “Explica el acuerdo PSAA16”

Tokenized: ["explica", "acuerdo", "psaa16"] Routing output:
[ENRUTADOR] max=1 [
    ('acuerdo_no._psaa16-10476.md', 15.2103)
]
Score breakdown:
  • TF-IDF: “acuerdo” → +1.0
  • Density: “acuerdo” appears frequently → +0.025
  • Filename: “acuerdo” matches, “psaa16” matches → score 2/3 ≈ 0.667 (2 of the 3 filename tokens {"acuerdo", "psaa16", "10476"})
  • Combined: (1.0 × 2.0) + (0.025 × 1.0) + (0.667 × 1.5) ≈ 3.025
Filename match provides the decisive boost, ensuring exact document selection.

Fallback: Vocabulary Overlap

If all three levels produce zero scores (no keyword/density/filename matches), fall back to simple vocabulary overlap:
siaa_proxy.py
scored = []
for nombre, doc in snap_docs.items():
    c = len(palabras_pregunta & doc["palabras"])
    if c >= 2:  # Minimum 2 shared words
        scored.append((c, nombre))
scored.sort(reverse=True)
return [n for _, n in scored[:max_docs]]
This ensures the system gracefully handles queries with uncommon vocabulary.

Tokenization Strategy

The tokenizer includes alphanumeric codes and long numbers:
siaa_proxy.py
def tokenizar(texto: str) -> list:
    """Alphanumeric tokenizer with smart number filtering.

    Lower-cases *texto*, extracts runs of 3+ alphanumeric characters
    (Spanish accented letters included), drops stopwords, keeps purely
    numeric tokens only when they have 4+ digits (years, agreement codes),
    and always keeps mixed letter/digit tokens such as "psaa16".
    """
    candidatos = re.findall(r'\b[a-záéíóúüñ0-9]{3,}\b', texto.lower())

    tokens = []
    for cand in candidatos:
        if cand in STOPWORDS_ES:
            continue
        if cand.isdigit():
            # Pure numbers: keep 4+ digits ("10476", "2016");
            # discard short ones ("1", "22", "999").
            if len(cand) >= 4:
                tokens.append(cand)
        else:
            # Contains letters: always keep ("psaa16", "art5").
            tokens.append(cand)

    return tokens

Why Include Alphanumeric Codes?

Before fix: “psaa16” was split into “psaa” + discarded “16” → routing failed
After fix: “psaa16” kept intact → correctly routes to acuerdo_no._psaa16-10476.md
Similarly, “10476” (agreement number) is now a valid search token.

Testing the Router

Diagnostic endpoint to test routing without querying the LLM:
curl "http://localhost:5000/siaa/enrutar?q=¿Cuáles+son+las+sanciones+por+no+reportar?"
Response:
{
  "pregunta": "¿Cuáles son las sanciones por no reportar?",
  "doc_especifico": false,
  "max_docs_usados": 2,
  "docs_encontrados": [
    {
      "doc": "acuerdo_no._psaa16-10476.md",
      "tamano": 45231,
      "coleccion": "general",
      "chunks": 38
    }
  ]
}

Next Steps

Build docs developers (and LLMs) love