The document routing system (detectar_documentos) uses a sophisticated three-level scoring algorithm to select the most relevant documents for a query. This is critical because sending irrelevant documents to the LLM wastes context window capacity and degrades response quality.
def detectar_documentos(pregunta: str, max_docs: int = MAX_DOCS_CONTEXTO) -> list:
    """Route a query to the most relevant loaded documents.

    Blends three signals into one weighted ranking: (1) TF-IDF keyword
    hits (manual keywords count double), (2) per-term density weighted
    by an approximate IDF, and (3) filename-token overlap.  When no
    signal fires at all, falls back to raw vocabulary overlap.

    Args:
        pregunta: the user's question (any case).
        max_docs: maximum number of document names to return.

    Returns:
        Up to ``max_docs`` document names, best match first.
    """
    q = pregunta.lower()
    terminos = set(tokenizar(q))
    tokens_3plus = set(re.findall(r'\b[a-záéíóúüñ0-9]{3,}\b', q))
    terminos_utiles = [t for t in terminos if t not in STOPWORDS_ES]

    # Snapshot shared state under each lock so scoring runs lock-free.
    with colecciones_lock:
        cols = dict(colecciones)
    with documentos_lock:
        docs = dict(documentos_cargados)
    with indice_densidad_lock:
        indice = dict(indice_densidad)
    N = len(docs) or 1

    # --- Level 1: keyword hits ---------------------------------------
    # Auto-generated TF-IDF keywords score 1.0 per hit; hand-curated
    # manual keywords score 2.0 because they are more specific.
    tfidf = defaultdict(float)
    for col in cols.values():
        for nombre, kws in col.get("keywords", {}).items():
            if nombre not in docs:
                continue
            hits = sum(1.0 for kw in kws if kw in q)
            if hits:
                tfidf[nombre] += hits
    for nombre, kws in KEYWORDS_MANUALES.items():
        if nombre not in docs:
            continue
        hits = sum(2.0 for kw in kws if kw in q)
        if hits:
            tfidf[nombre] += hits

    # --- Level 2: density --------------------------------------------
    densidades = defaultdict(float)
    for termino in terminos_utiles:
        postings = indice.get(termino)
        if not postings:
            continue
        # Smoothed IDF approximation over the posting-list length.
        idf = math.log((N + 1) / (len(postings) + 1)) + 1
        # Cap at the 5 densest documents per term to limit noise.
        for dens, nombre in postings[:5]:
            densidades[nombre] += dens * idf

    # --- Level 3: filename -------------------------------------------
    nombres = defaultdict(float)
    for nombre, doc in docs.items():
        tok = doc.get("tokens_nombre", set())
        comunes = tok & tokens_3plus
        if comunes:
            # Overlap ratio: shared tokens over total filename tokens.
            nombres[nombre] = len(comunes) / (len(tok) or 1)

    # --- Weighted combination ----------------------------------------
    total = defaultdict(float)
    for tabla, peso in ((tfidf, 2.0), (densidades, 1.0), (nombres, 1.5)):
        for nombre, s in tabla.items():
            total[nombre] += s * peso

    if total:
        ranking = sorted(total, key=total.get, reverse=True)[:max_docs]
        log_scores = [(d, round(total[d], 4)) for d in ranking]
        print(f" [ENRUTADOR] max={max_docs} {log_scores}", flush=True)
        return ranking

    # --- Fallback: plain vocabulary overlap --------------------------
    solapamiento = [
        (len(terminos & doc["palabras"]), nombre)
        for nombre, doc in docs.items()
    ]
    # Require at least 2 shared words; rank by overlap, highest first.
    mejores = sorted((par for par in solapamiento if par[0] >= 2), reverse=True)
    return [nombre for _, nombre in mejores[:max_docs]]
At startup, SIAA computes the top 20 TF-IDF keywords for each document:
siaa_proxy.py
# Keyword-extraction parameters: keep the 20 best terms per document and
# ignore terms seen fewer than twice or shorter than three characters.
TOP_KEYWORDS_POR_DOC = 20
MIN_FREQ_KEYWORD = 2
MIN_LEN_KEYWORD = 3


def calcular_tfidf_coleccion(documentos: dict) -> dict:
    """Compute the top TF-IDF keywords for every document in a collection.

    Args:
        documentos: mapping of document name -> dict with a "contenido"
            (raw text) entry.

    Returns:
        Mapping of document name -> list of up to TOP_KEYWORDS_POR_DOC
        keywords, ranked by descending TF-IDF score.
    """
    tokens_por_doc = {
        nombre: tokenizar(doc["contenido"]) for nombre, doc in documentos.items()
    }
    total_docs = len(documentos)

    # Document frequency: in how many documents each term appears.
    df = Counter()
    for tokens in tokens_por_doc.values():
        df.update(set(tokens))

    resultado = {}
    for nombre, tokens in tokens_por_doc.items():
        if not tokens:
            # Empty document: no keywords to extract.
            resultado[nombre] = []
            continue
        frecuencias = Counter(tokens)
        longitud = len(tokens)
        puntajes = {}
        for termino, freq in frecuencias.items():
            # Rare or very short terms make poor routing keywords.
            if freq < MIN_FREQ_KEYWORD or len(termino) < MIN_LEN_KEYWORD:
                continue
            tf = freq / longitud
            # Smoothed IDF: the +1 terms avoid division by zero and keep
            # every score strictly positive.
            idf = math.log((total_docs + 1) / (df[termino] + 1)) + 1
            puntajes[termino] = tf * idf
        ranking = sorted(puntajes, key=puntajes.get, reverse=True)
        resultado[nombre] = ranking[:TOP_KEYWORDS_POR_DOC]
    return resultado
Certain domain-specific terms don’t rank high in automatic TF-IDF but are critical for routing:
siaa_proxy.py
# Hand-curated routing keywords, keyed by document name.  These terms are
# critical for routing yet rank low in automatic TF-IDF; the router gives
# each manual-keyword hit double weight (2.0 instead of 1.0).
KEYWORDS_MANUALES = {
    "acuerdo_pcsja19-11207.md": [
        "capacitacion",
        "capacita",
        "capacitar",
        "quien capacita",
        "cendoj",
        "udae",
        "unidad de desarrollo",
        "analisis estadistico",
        "presentacion de informes",
        "primer informe",
    ],
    "acuerdo_no._psaa16-10476.md": [
        # Definition queries
        "que es sierju",
        "que es el sierju",
        "para que sirve",
        "objeto",
        "sistema de informacion",
        "de que trata",
        "proposito",
        "finalidad",
        # Periodicity and deadlines
        "sierju",
        "periodicidad",
        "reportar",
        "quinto dia habil",
        # Roles (article 7)
        "roles",
        "super administrador",
        "funcionario",
        "juez",
        "magistrado",
        # Responsibilities (article 5)
        "responsable",
        "quien carga",
        "cargar informacion",
        # Sanctions (articles 19-20)
        "sancion",
        "incumplimiento",
        "no reporto",
        "consecuencia",
        "disciplinario",
        "no reportar",
        "castigo",
    ],
}
Manual keywords receive 2x weight to ensure queries about sanctions, roles, or specific forms route to the correct document even if those terms appear infrequently.
The density index tracks relative frequency of each term within each document:
siaa_proxy.py
# Rebuild the inverted density index: term -> [(density, doc_name), ...],
# where density = term frequency / total tokens in that document.
nuevo_indice = defaultdict(list)
for nombre_doc, doc in todos_los_docs.items():
    longitud = doc["total_tokens"]
    if longitud == 0:
        continue  # empty document contributes nothing
    for termino, veces in doc["token_count"].items():
        # Skip very short terms; they are too noisy to index.
        if len(termino) < MIN_LEN_KEYWORD:
            continue
        nuevo_indice[termino].append((veces / longitud, nombre_doc))

# Order each posting list so the densest documents come first.
for lista in nuevo_indice.values():
    lista.sort(reverse=True)
A document with 100 occurrences of “sanción” in 5,000 tokens has density = 0.02 (2%). This document is likely MORE relevant for sanction queries than a document with 50 occurrences in 50,000 tokens (density = 0.001).
Density scoring prevents long documents from dominating purely due to size. A 200KB document isn’t automatically ranked higher than a focused 20KB document.
def _tokens_nombre_archivo(nombre_clave: str) -> set: """Extract tokens from filename for matching.""" sin_ext = os.path.splitext(nombre_clave)[0] partes = re.split(r'[_\s\-\.]+', sin_ext.lower()) # Include alphanumeric parts (psaa16, 10476) return {p for p in partes if len(p) >= 3}# Example: "acuerdo_no._psaa16-10476.md" → {"acuerdo", "psaa16", "10476"}
If the user asks “¿Qué dice el PSAA16?”, filename matching ensures the document acuerdo_no._psaa16-10476.md receives a strong boost even if “psaa16” appears in other documents.
If all three levels produce zero scores (no keyword/density/filename matches), fall back to simple vocabulary overlap:
siaa_proxy.py
# Fallback routing (excerpt from detectar_documentos): plain vocabulary
# overlap between the query and each document, used only when all three
# scoring levels produced zero scores.
scored = []
for nombre, doc in snap_docs.items():
    # Count query words that also appear in the document's vocabulary.
    c = len(palabras_pregunta & doc["palabras"])
    if c >= 2:  # Minimum 2 shared words
        scored.append((c, nombre))
# Largest overlap first; keep only the top max_docs names.
scored.sort(reverse=True)
return [n for _, n in scored[:max_docs]]
This ensures the system gracefully handles queries with uncommon vocabulary.
The tokenizer includes alphanumeric codes and long numbers:
siaa_proxy.py
def tokenizar(texto: str) -> list:
    """Alphanumeric tokenizer with smart number filtering.

    Lowercases the text, extracts runs of 3+ letters/digits (Spanish
    accented characters included), and drops stopwords.  Purely numeric
    tokens are kept only when they have 4+ digits (years and codes such
    as "2016" or "10476"); short bare numbers ("1", "22", "999") are
    discarded.  Tokens containing letters ("psaa16", "art5") always pass.
    """
    limpio = texto.lower()
    tokens = []
    for token in re.findall(r'\b[a-záéíóúüñ0-9]{3,}\b', limpio):
        if token in STOPWORDS_ES:
            continue
        # Bare numbers need at least 4 digits to be meaningful codes.
        if token.isdigit() and len(token) < 4:
            continue
        tokens.append(token)
    return tokens
Why Include Alphanumeric Codes?
Before the fix, “psaa16” was split into “psaa” with “16” discarded, so routing failed. After the fix, “psaa16” is kept intact and correctly routes to acuerdo_no._psaa16-10476.md. Similarly, “10476” (the agreement number) is now a valid search token.