Excel to SQLite Conversion
SIAA automatically converts Excel files (.xls, .xlsx) into SQLite database tables for efficient querying and data management.
Conversion Function
The excel_to_sqlite() function handles the complete conversion process:
def excel_to_sqlite ( excel_path : Path, conn : sqlite3.Connection,
table_name : str ) -> tuple[ bool , int , str ]:
suffix = excel_path.suffix.lower()
if suffix not in ( ".xls" , ".xlsx" ):
return False , 0 , f "Extensión Excel no soportada: { suffix } "
try :
engine = "xlrd" if suffix == ".xls" else "openpyxl"
df = pd.read_excel(excel_path, engine = engine)
except Exception as exc:
return False , 0 , f "No se pudo leer Excel: { exc } "
cleaned = [sanitize_column(c) for c in df.columns.tolist()]
df.columns = make_unique_columns(cleaned)
conn.execute( f 'DROP TABLE IF EXISTS " { table_name } "' )
df.to_sql(table_name, conn, if_exists = "replace" , index = False )
return True , int ( len (df)), "Tabla creada/reemplazada."
.xlsx Files Modern Excel format (Office 2007+) using openpyxl engine
.xls Files Legacy Excel format (Office 97-2003) using xlrd engine
Column Sanitization
Column names are sanitized to ensure SQL compatibility:
def sanitize_column ( name : Any) -> str :
return slugify_ascii( "" if name is None else str (name)) or "columna"
def slugify_ascii ( name : str ) -> str :
normalized = unicodedata.normalize( "NFKD" , name)
ascii_only = normalized.encode( "ascii" , "ignore" ).decode( "ascii" )
collapsed = re.sub( r " [ \s \- / ] + " , "_" , ascii_only.lower())
cleaned = re.sub( r " [ ^ a-z0-9_ ] " , "" , collapsed)
cleaned = re.sub( r "_ + " , "_" , cleaned).strip( "_" )
return cleaned or "sin_nombre"
Sanitization Process
Unicode normalization
Convert to NFKD form and remove accents: "Número" → "Numero"
ASCII conversion
Encode to ASCII, ignoring non-ASCII characters
Lowercase and replace separators
Spaces, hyphens, slashes → underscores: "Fecha Inicio" → "fecha_inicio"
Remove invalid characters
Keep only a-z, 0-9, _
Collapse underscores
Multiple underscores → single underscore
Fallback for empty results
If result is empty, use "sin_nombre"
Sanitization Examples
"Nombre Completo" → "nombre_completo"
"Fecha de Nacimiento" → "fecha_de_nacimiento"
"Código (2024)" → "codigo_2024"
"N° Documento" → "n_documento"
"Año/Mes" → "ano_mes"
"Cédula#" → "cedula"
"" → "columna"
None → "columna"
Handling Duplicate Column Names
Duplicate column names are automatically made unique:
def make_unique_columns ( columns : list[ str ]) -> list[ str ]:
used: dict[ str , int ] = {}
unique: list[ str ] = []
for col in columns:
base = col or "columna"
count = used.get(base, 0 ) + 1
used[base] = count
unique.append(base if count == 1 else f " { base } _ { count } " )
return unique
Duplicate Handling Logic
First occurrence keeps original name; subsequent duplicates get _2, _3, etc.
Example:
Original Columns Sanitized Unique Columns "Nombre", "Nombre", "Nombre""nombre", "nombre_2", "nombre_3""", "", "Dato""columna", "columna_2", "dato"
Table Naming
Table names are generated from folder names using the same slugification:
def process_folder ( folder_path : Path, md_dir : Path, conn : sqlite3.Connection,
errors : list[ str ], results : list[FolderResult],
convert_doc : bool ) -> None :
folder_name = folder_path.name
slug = slugify_ascii(folder_name)
# ...
if len (excel_files) == 1 :
ok, rows, msg = excel_to_sqlite(excel_files[ 0 ], conn, slug)
Table Replacement : Existing tables with the same name are dropped before insertion: DROP TABLE IF EXISTS "{table_name}"
Database Structure
Connection and Storage
DEFAULT_DB = Path( "/opt/siaa/institucional.db" ) # Base de datos SQLite
# In main processing:
with sqlite3.connect(db_path) as conn:
for folder in subfolders:
process_folder(folder, dest_md, conn, errors, results, convert_doc)
conn.commit()
Pandas Integration
Conversion uses pandas for data handling:
engine = "xlrd" if suffix == ".xls" else "openpyxl"
df = pd.read_excel(excel_path, engine = engine)
# ... sanitization ...
df.to_sql(table_name, conn, if_exists = "replace" , index = False )
Index Exclusion : index=False prevents pandas row indices from being stored as a column
Querying Converted Data
Once converted, data can be queried using standard SQL:
import sqlite3
conn = sqlite3.connect( '/opt/siaa/institucional.db' )
# List all tables
cursor = conn.execute( "SELECT name FROM sqlite_master WHERE type='table'" )
tables = cursor.fetchall()
# Query specific table
for row in conn.execute( 'SELECT * FROM "juzgado_civil_municipal" LIMIT 10' ):
print (row)
# Count rows
count = conn.execute( 'SELECT COUNT(*) FROM "juzgado_civil_municipal"' ).fetchone()[ 0 ]
print ( f "Total rows: { count } " )
Query Examples
Basic SELECT
Column Search
Aggregations
JOIN Example
SELECT * FROM "juzgado_civil_municipal"
WHERE fecha_inicio >= '2024-01-01'
ORDER BY numero_proceso;
Complete Processing Flow
The system processes folders containing both Word/PDF and Excel files:
def process_folder ( folder_path : Path, md_dir : Path, conn : sqlite3.Connection,
errors : list[ str ], results : list[FolderResult],
convert_doc : bool ) -> None :
folder_name = folder_path.name
slug = slugify_ascii(folder_name)
md_path = md_dir / f " { slug } .md"
result = FolderResult( folder_name = folder_name, slug = slug, md_path = md_path)
files = [p for p in folder_path.iterdir() if p.is_file()]
word_files = [p for p in files if p.suffix.lower() in ( ".doc" , ".docx" )]
pdf_files = [p for p in files if p.suffix.lower() == ".pdf" ]
excel_files = [p for p in files if p.suffix.lower() in ( ".xls" , ".xlsx" )]
# ... Word/PDF processing ...
# Excel → SQLite
if len (excel_files) != 1 :
msg = f "Se esperaba 1 Excel y se encontraron { len (excel_files) } ."
result.sql_ok, result.sql_msg = False , msg
result.errors.append(msg)
else :
try :
ok, rows, msg = excel_to_sqlite(excel_files[ 0 ], conn, slug)
result.sql_ok, result.sql_rows, result.sql_msg = ok, rows, msg
if not ok:
result.errors.append(msg)
except Exception as exc:
msg = f "Fallo Excel→SQLite: { exc } "
result.sql_ok, result.sql_msg = False , msg
result.errors.append(msg)
Folder Expectation : Each folder should contain exactly 1 Excel file. Multiple or zero Excel files trigger a warning.
Verification and Statistics
After conversion, the system provides detailed statistics:
def print_verification ( md_dir : Path, conn : sqlite3.Connection,
results : list[FolderResult], errors : list[ str ]) -> None :
print ( " \n === Tablas SQLite ===" )
for r in [r for r in results if r.sql_ok]:
count = conn.execute( f 'SELECT COUNT(*) FROM " { r.slug } "' ).fetchone()[ 0 ]
print ( f " { r.slug } : { count } filas" )
total = len (results)
sql_ok = sum ( 1 for r in results if r.sql_ok)
print ( f " \n === Resumen ===" )
print ( f " Excel → SQLite : { sql_ok } ✅ { total - sql_ok } ❌" )
Example Output
=== Tablas SQLite ===
juzgado_civil_municipal: 245 filas
juzgado_penal_circuito: 189 filas
registro_personal: 67 filas
=== Resumen ===
Carpetas procesadas : 15
Word/PDF → Markdown : 14 ✅ 1 ❌
Excel → SQLite : 13 ✅ 2 ❌
Carpetas con errores: 3
Error Handling
Unsupported file extension
Returns: (False, 0, f"Extensión Excel no soportada: {suffix}")
Returns: (False, 0, f"No se pudo leer Excel: {exc}")
Warning: "Se esperaba 1 Excel y se encontraron {count}."
Warning: "Se esperaba 1 Excel y se encontraron 0."
Error: f"Fallo Excel→SQLite: {exc}"
Data Types
Pandas automatically infers SQL data types:
Integers Python int / NumPy int64 → SQLite INTEGER
Floats Python float / NumPy float64 → SQLite REAL
Strings Python str / pandas object → SQLite TEXT
Dates pandas datetime64 → SQLite TEXT (ISO format)
Command-Line Usage
Convert All Folders
Custom Database Path
Specific Folder
Multiple Specific Folders
# Process all folders in /opt/siaa/instructivos
python3 convertidor.py
Installation
Install pandas
pip install pandas --break-system-packages
Install Excel engines
pip install openpyxl xlrd --break-system-packages
Verify installation
python3 -c "import pandas; import openpyxl; import xlrd; print('OK')"
Test conversion
python3 convertidor.py --only-folder "Test"
Large Files : Excel files with 100,000+ rows may take several seconds to convert. Consider breaking them into smaller files.
Efficient : SQLite provides fast querying once data is converted, even for large datasets
Integration Example
Complete folder processing example:
from pathlib import Path
import sqlite3
# Setup
db_path = Path( "/opt/siaa/institucional.db" )
conn = sqlite3.connect(db_path)
# Convert
excel_file = Path( "/opt/siaa/instructivos/Juzgado Civil/datos.xlsx" )
table_name = "juzgado_civil"
ok, rows, msg = excel_to_sqlite(excel_file, conn, table_name)
if ok:
print ( f "✅ Converted { rows } rows to table ' { table_name } '" )
# Verify
cursor = conn.execute( f 'SELECT COUNT(*) FROM " { table_name } "' )
count = cursor.fetchone()[ 0 ]
print ( f "Table contains { count } rows" )
# Sample data
cursor = conn.execute( f 'SELECT * FROM " { table_name } " LIMIT 5' )
for row in cursor:
print (row)
else :
print ( f "❌ Conversion failed: { msg } " )
conn.close()
Result Structure
The conversion returns a tuple:
(success: bool , row_count: int , message: str )
Examples:
(True, 245, "Tabla creada/reemplazada.") ✅
(False, 0, "Extensión Excel no soportada: .csv") ❌
(False, 0, "No se pudo leer Excel: File not found") ❌