Overview
The hash utilities module provides HMAC-based hashing functions for creating secure, consistent identifiers for embedding inputs. These hashes are used to uniquely identify and deduplicate embedding inputs in the AveniECA system.
Functions
encode()
Generates an HMAC hash of a message using a secret key and specified algorithm.
Import:
from avenieca.utils.hash import encode
Parameters
First positional argument (str) - The message string to hash (typically the embedding input text)
Second positional argument (str) - The secret key used for HMAC hashing (should be kept secure and consistent)
algorithm (str, optional) - The hashing algorithm to use. Supported algorithms:
"sha256" (default)
"sha384"
"sha512"
"sha224"
"sha1"
"sha3_256"
"sha3_224"
"sha3_512"
"md5"
Returns
str - Base64-encoded HMAC hash of the message
Raises
Exception - If the specified algorithm is not available
Usage Examples
Basic Hash Generation
from avenieca.utils.hash import encode
# Hash a sample sentence; no algorithm is passed, so the default ("sha256") applies.
secret_key = "your-secret-key"
input_text = "The quick brown fox jumps over the lazy dog"

hash_value = encode(input_text, secret_key)
print(hash_value)
# Output: "aVXOlXkPCdHhC8qyWOlKBxbpJbO7E+N3Xs4qhwbJAeg="
from avenieca.utils.hash import encode
from avenieca.api.model import EmbeddingInputInsert
from avenieca import Client
# The text to embed and the module it belongs to
module_id = "text_embeddings"
input_text = "Machine learning model architecture"

# Derive the hash that identifies this input
# (the literal secret is illustrative only; load real secrets from the environment)
secret = "production-secret-key"
hash_value = encode(input_text, secret)

# Build the insert payload carrying the text together with its hash
embedding_input = EmbeddingInputInsert(
    module_id=module_id,
    input=input_text,
    hash=hash_value,
)

# Send the record to the API
# NOTE: `config` is not defined in this snippet; pass your own client configuration.
client = Client(config=config)
response = client.insert_embedding_input(embedding_input)
print(f"Embedding input ID: {response.id}")
Deduplication with Hashes
from avenieca.utils.hash import encode
from avenieca.api.model import EmbeddingInputHash, EmbeddingInputInsert
from avenieca import Client

# NOTE: `config` is not defined in this snippet; pass your own client configuration.
client = Client(config=config)
secret = "your-secret-key"

# Hash the candidate input so it can be looked up before inserting
input_text = "Natural language processing"
hash_value = encode(input_text, secret)

# Query by hash to avoid duplicates
try:
    existing = client.get_embedding_input_by_hash(
        EmbeddingInputHash(hash=hash_value)
    )
    print(f"Input already exists with ID: {existing.id}")
except Exception:
    # The lookup failed, which this example treats as "input doesn't exist",
    # so a new record is created. Catching `Exception` instead of using a bare
    # `except:` lets KeyboardInterrupt and SystemExit propagate.
    # NOTE(review): narrow this to the client's "not found" error once its
    # exception type is confirmed.
    new_input = EmbeddingInputInsert(
        module_id="nlp_module",
        input=input_text,
        hash=hash_value,
    )
    response = client.insert_embedding_input(new_input)
    print(f"Created new input with ID: {response.id}")
Using Different Hash Algorithms
from avenieca.utils.hash import encode
secret = "my-secret"
input_text = "Deep learning neural networks"

# Default algorithm, spelled out explicitly (recommended)
hash_sha256 = encode(input_text, secret, algorithm="sha256")
print(f"SHA-256: {hash_sha256}")

# Larger digest, so the resulting hash string is longer
hash_sha512 = encode(input_text, secret, algorithm="sha512")
print(f"SHA-512: {hash_sha512}")

# SHA-3 family variant
hash_sha3 = encode(input_text, secret, algorithm="sha3_256")
print(f"SHA3-256: {hash_sha3}")
Consistent Hashing Across Sessions
import os
from avenieca.utils.hash import encode
# Read the secret from the environment so every session hashes with the same key.
# NOTE: the "default-secret" fallback is a convenience for this example only;
# a real deployment should require the variable to be set.
SECRET_KEY = os.environ.get("AVENI_HASH_SECRET", "default-secret")


def get_input_hash(text: str) -> str:
    """Return the sha256 HMAC hash of ``text`` keyed with ``SECRET_KEY``."""
    digest = encode(text, SECRET_KEY, algorithm="sha256")
    return digest


# Hashing the same text twice yields the same value
hash1 = get_input_hash("Hello, world!")
hash2 = get_input_hash("Hello, world!")
assert hash1 == hash2  # True

# A different text yields a different value
hash3 = get_input_hash("Goodbye, world!")
assert hash1 != hash3  # True
Security Best Practices
Keep your secret key secure and never commit it to version control. Use environment variables or secure configuration management.
Environment-Based Secrets
import os
from avenieca.utils.hash import encode
# Good - the secret comes from the environment, and a missing value fails fast
SECRET = os.environ.get("AVENI_HASH_SECRET")
if not SECRET:
    raise ValueError("AVENI_HASH_SECRET environment variable not set")

hash_value = encode("my input", SECRET)

# Bad - the secret is written directly into the source (DON'T DO THIS)
hash_value = encode("my input", "hardcoded-secret-123")  # Never hardcode!
Consistent Secret Across Environments
import os
from avenieca.utils.hash import encode
class HashConfig:
    """Centralized hash configuration read from environment variables.

    Attributes:
        secret: Value of ``AVENI_HASH_SECRET``.
        algorithm: Value of ``AVENI_HASH_ALGO``, or "sha256" when unset.

    Raises:
        ValueError: On construction, if ``AVENI_HASH_SECRET`` is unset or empty.
    """

    def __init__(self):
        env_secret = os.environ.get("AVENI_HASH_SECRET")
        self.secret = env_secret
        if not env_secret:
            raise ValueError("Hash secret not configured")
        self.algorithm = os.environ.get("AVENI_HASH_ALGO", "sha256")

    def hash_input(self, text: str) -> str:
        """Hash ``text`` with the configured secret and algorithm."""
        return encode(text, self.secret, self.algorithm)


# Create one instance and reuse it wherever a hash is needed
config = HashConfig()
hash1 = config.hash_input("first input")
hash2 = config.hash_input("second input")
Common Use Cases
Caching Embeddings
from avenieca.utils.hash import encode
import json
# In-memory cache: hash of the input text -> embedding stored as a list
embedding_cache = {}
secret = "cache-secret"


def get_or_compute_embedding(text: str, model):
    """Return the embedding for ``text``, computing it only on a cache miss.

    The cache key is the hash of ``text``. On a miss, ``model.encode(text)``
    is called and its ``.tolist()`` result is stored and returned.
    """
    hash_key = encode(text, secret)
    if hash_key not in embedding_cache:
        print("Cache miss - computing embedding")
        embedding = model.encode(text)
        embedding_cache[hash_key] = embedding.tolist()
    else:
        print("Cache hit")
    return embedding_cache[hash_key]


# NOTE: `model` is not defined in this snippet; the function calls
# `model.encode(text)` and `.tolist()` on the result.
# First call computes
embed1 = get_or_compute_embedding("Hello world", model)
# Second call uses cache
embed2 = get_or_compute_embedding("Hello world", model)
Data Integrity Verification
from avenieca.utils.hash import encode
def verify_input_integrity(original_text: str, received_hash: str, secret: str) -> bool:
    """Verify that text matches its hash.

    Recomputes the hash of ``original_text`` with ``secret`` (using the
    default algorithm of ``encode``) and compares it to ``received_hash``.

    Args:
        original_text: The text whose integrity is being checked.
        received_hash: The hash previously produced for the text.
        secret: The secret key the hash was produced with.

    Returns:
        True if the recomputed hash equals ``received_hash``, False otherwise
        (including when ``received_hash`` is not a string).
    """
    import hmac  # local import: the snippet's import list only brings in `encode`

    computed_hash = encode(original_text, secret)
    if not isinstance(received_hash, str):
        # A plain `==` against a non-string would be False; keep that result
        # instead of raising from the comparison below.
        return False
    # Constant-time comparison, so the time taken does not reveal how many
    # leading characters of the hash matched. Both sides are encoded to bytes
    # because compare_digest only accepts ASCII-only str values.
    return hmac.compare_digest(
        computed_hash.encode("utf-8"), received_hash.encode("utf-8")
    )
# Example: store a hash alongside the data
original = "Important data"
stored_hash = encode(original, "secret")

# Later, recompute the hash and compare it with the stored one
is_intact = verify_input_integrity(original, stored_hash, "secret")
if not is_intact:
    print("Warning: Data has been modified")
else:
    print("Data integrity verified")
Algorithm Selection Guide
| Algorithm | Hash Length | Use Case |
|---|---|---|
| sha256 | 44 chars | Recommended - Good balance of security and performance |
| sha512 | 88 chars | Maximum security, longer hashes |
| sha3_256 | 44 chars | Newer standard, same security as SHA-256 |
| sha1 | 28 chars | Legacy support only (less secure) |
| md5 | 24 chars | Not recommended (weak security) |
Use sha256 (the default) for most applications. It provides strong security and is widely supported.