Skip to main content

Overview

The hash utilities module provides HMAC-based hashing functions for creating secure, consistent identifiers for embedding inputs. These hashes are used to uniquely identify and deduplicate embedding inputs in the AveniECA system.

Functions

encode()

Generates an HMAC hash of a message using a secret key and specified algorithm. Import:
from avenieca.utils.hash import encode

Parameters

msg
str
required
The message string to hash (typically the embedding input text)
secret
str
required
The secret key used for HMAC hashing (should be kept secure and consistent)
algorithm
str
default:"sha256"
The hashing algorithm to use. Supported algorithms:
  • "sha256" (default)
  • "sha384"
  • "sha512"
  • "sha224"
  • "sha1"
  • "sha3_256"
  • "sha3_224"
  • "sha3_512"
  • "md5"

Returns

str - Base64-encoded HMAC hash of the message

Raises

Exception - If the specified algorithm is not available

Usage Examples

Basic Hash Generation

from avenieca.utils.hash import encode

# Generate hash for embedding input
input_text = "The quick brown fox jumps over the lazy dog"
secret_key = "your-secret-key"

hash_value = encode(input_text, secret_key)
print(hash_value)
# Output: "aVXOlXkPCdHhC8qyWOlKBxbpJbO7E+N3Xs4qhwbJAeg="

Creating Embedding Input Records

from avenieca.utils.hash import encode
from avenieca.api.model import EmbeddingInputInsert
from avenieca import Client

# Create hash for embedding input
input_text = "Machine learning model architecture"
module_id = "text_embeddings"
secret = "production-secret-key"

hash_value = encode(input_text, secret)

# Create embedding input record
embedding_input = EmbeddingInputInsert(
    module_id=module_id,
    input=input_text,
    hash=hash_value
)

# Submit to API
client = Client(config=config)
response = client.insert_embedding_input(embedding_input)
print(f"Embedding input ID: {response.id}")

Deduplication with Hashes

from avenieca.utils.hash import encode
from avenieca.api.model import EmbeddingInputHash, EmbeddingInputInsert
from avenieca import Client

# NOTE: `config` is assumed to be an already-built client configuration;
# it is not defined in this snippet.
client = Client(config=config)
secret = "your-secret-key"

# Hash the input so it can be looked up without sending a second copy
input_text = "Natural language processing"
hash_value = encode(input_text, secret)

# Query by hash to avoid duplicates
try:
    # Keep the try body to the lookup only, so unrelated errors
    # (e.g. a typo in the success path) are not mistaken for "not found".
    existing = client.get_embedding_input_by_hash(
        EmbeddingInputHash(hash=hash_value)
    )
except Exception:
    # Lookup failed - treated here as "input doesn't exist", so create it.
    # NOTE(review): narrow this to the client's specific "not found" error
    # if it exposes one, so network/auth failures are not treated as a miss.
    new_input = EmbeddingInputInsert(
        module_id="nlp_module",
        input=input_text,
        hash=hash_value
    )
    response = client.insert_embedding_input(new_input)
    print(f"Created new input with ID: {response.id}")
else:
    print(f"Input already exists with ID: {existing.id}")

Using Different Hash Algorithms

from avenieca.utils.hash import encode

input_text = "Deep learning neural networks"
secret = "my-secret"

# SHA-256 (default, recommended)
hash_sha256 = encode(input_text, secret, algorithm="sha256")
print(f"SHA-256: {hash_sha256}")

# SHA-512 (more secure, longer hash)
hash_sha512 = encode(input_text, secret, algorithm="sha512")
print(f"SHA-512: {hash_sha512}")

# SHA3-256 (newer standard)
hash_sha3 = encode(input_text, secret, algorithm="sha3_256")
print(f"SHA3-256: {hash_sha3}")

Consistent Hashing Across Sessions

import os
from avenieca.utils.hash import encode

# Use an environment variable so every session hashes with the same secret.
# Fail fast instead of falling back to a hard-coded default: a default secret
# is public knowledge, and a silently different key would produce hashes that
# no longer match the ones created elsewhere.
SECRET_KEY = os.getenv("AVENI_HASH_SECRET")
if not SECRET_KEY:
    raise ValueError("AVENI_HASH_SECRET environment variable not set")

def get_input_hash(text: str) -> str:
    """Return the SHA-256 HMAC hash of ``text`` keyed with ``SECRET_KEY``.

    The same text and secret always yield the same hash, which is what makes
    the value usable as a stable identifier across sessions.
    """
    return encode(text, SECRET_KEY, algorithm="sha256")

# These will produce the same hash every time
hash1 = get_input_hash("Hello, world!")
hash2 = get_input_hash("Hello, world!")
assert hash1 == hash2  # True

# Different inputs produce different hashes
hash3 = get_input_hash("Goodbye, world!")
assert hash1 != hash3  # True

Security Best Practices

Keep your secret key secure and never commit it to version control. Use environment variables or secure configuration management.

Environment-Based Secrets

import os
from avenieca.utils.hash import encode

# Good - Load secret from environment
SECRET = os.getenv("AVENI_HASH_SECRET")
if not SECRET:
    raise ValueError("AVENI_HASH_SECRET environment variable not set")

hash_value = encode("my input", SECRET)

# Bad - Hardcoded secret (DON'T DO THIS)
hash_value = encode("my input", "hardcoded-secret-123")  # Never hardcode!

Consistent Secret Across Environments

import os
from avenieca.utils.hash import encode

class HashConfig:
    """Single place that reads the hashing settings from the environment.

    Attributes are populated once, at construction time:
      * ``secret``    - value of ``AVENI_HASH_SECRET`` (must be set and non-empty)
      * ``algorithm`` - value of ``AVENI_HASH_ALGO``, or ``"sha256"`` when unset
    """

    def __init__(self):
        env = os.environ
        self.secret = env.get("AVENI_HASH_SECRET")
        # A missing or empty secret is a configuration error, not a default.
        if not self.secret:
            raise ValueError("Hash secret not configured")
        self.algorithm = env.get("AVENI_HASH_ALGO", "sha256")

    def hash_input(self, text: str) -> str:
        """Hash ``text`` with the configured secret and algorithm."""
        digest = encode(text, self.secret, algorithm=self.algorithm)
        return digest

# Use throughout your application
config = HashConfig()
hash1 = config.hash_input("first input")
hash2 = config.hash_input("second input")

Common Use Cases

Caching Embeddings

from avenieca.utils.hash import encode
import json

# Cache embeddings by hash to avoid recomputation
embedding_cache = {}
secret = "cache-secret"

def get_or_compute_embedding(text: str, model):
    """Return the embedding for ``text``, computing it only on a cache miss.

    The cache key is the HMAC hash of ``text``; the stored value is the
    result of ``model.encode(text).tolist()``.
    """
    cache_key = encode(text, secret)

    if cache_key not in embedding_cache:
        print("Cache miss - computing embedding")
        vector = model.encode(text)
        # Store the list form so hits and misses return the same type.
        embedding_cache[cache_key] = vector.tolist()
    else:
        print("Cache hit")

    return embedding_cache[cache_key]

# First call computes
embed1 = get_or_compute_embedding("Hello world", model)

# Second call uses cache
embed2 = get_or_compute_embedding("Hello world", model)

Data Integrity Verification

import hmac

from avenieca.utils.hash import encode

def verify_input_integrity(original_text: str, received_hash: str, secret: str) -> bool:
    """Return True if ``received_hash`` is the HMAC hash of ``original_text``.

    Args:
        original_text: The text whose integrity is being checked.
        received_hash: The hash that was stored or transmitted with the text.
        secret: The secret key the hash was originally created with.

    Returns:
        True when the recomputed hash matches ``received_hash``, else False.
    """
    # A non-string hash can never match.
    if not isinstance(received_hash, str):
        return False
    computed_hash = encode(original_text, secret)
    # Compare in constant time: a plain ``==`` stops at the first differing
    # character, which can leak how much of a forged hash was correct.
    # Comparing bytes also avoids compare_digest's ASCII-only limit on str.
    return hmac.compare_digest(
        computed_hash.encode("utf-8"), received_hash.encode("utf-8")
    )

# Example
original = "Important data"
stored_hash = encode(original, "secret")

# Later, verify integrity
if verify_input_integrity(original, stored_hash, "secret"):
    print("Data integrity verified")
else:
    print("Warning: Data has been modified")

Algorithm Selection Guide

| Algorithm | Hash Length | Use Case |
| --- | --- | --- |
| sha256 | 44 chars | Recommended - Good balance of security and performance |
| sha512 | 88 chars | Maximum security, longer hashes |
| sha3_256 | 44 chars | Newer standard, same security as SHA-256 |
| sha1 | 28 chars | Legacy support only (less secure) |
| md5 | 24 chars | Not recommended (weak security) |

Use sha256 (the default) for most applications. It provides strong security and is widely supported.

Build docs developers (and LLMs) love