Use this file to discover all available pages before exploring further.
The PubChem RAG (Retrieval-Augmented Generation) module enriches chemistry queries by automatically fetching relevant compound information from the PubChem database.
The RAG pipeline extracts chemistry terms from your query, retrieves compound data from PubChem, and uses this context to provide more informed responses.
from plan_execute_agent.pubchem_rag.query_chemistry import query_chemistry_related

# Ask a free-text question; the pipeline extracts terms, queries PubChem,
# and returns the generated response.
query = "Tell me about caffeine and its effects"
response = query_chemistry_related(query)
print(response)
The extract_terms.py module uses spaCy part-of-speech tags to pick out candidate chemistry terms — every noun or proper noun longer than two characters:
# From plan_execute_agent/pubchem_rag/extract_terms.py:7
import spacy

nlp = spacy.load("en_core_web_sm")


def extract_chemistry_terms(query_text: str):
    """
    Extract candidate chemistry-related terms from the query using spaCy.

    Args:
        query_text: Free-text user query.

    Returns:
        The text of every token tagged NOUN or PROPN that is longer than
        two characters, in order of appearance (duplicates are kept).
    """
    doc = nlp(query_text)

    # Initial filter for nouns and proper nouns through spacy
    terms = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN"} and len(token.text) > 2
    ]
    return terms
Example:
from plan_execute_agent.pubchem_rag.extract_terms import extract_chemistry_terms

query = "What is the molecular weight of caffeine and theophylline?"
terms = extract_chemistry_terms(query)
print(terms)
# Output: ['weight', 'caffeine', 'theophylline']
# From plan_execute_agent/pubchem_rag/query_chemistry.py:13
from .extract_terms import extract_chemistry_terms
from .pubchem_fetcher import fetch_pubchem_data
from .llm_response import generate_llm_response


def query_chemistry_related(query_text: str):
    """
    Parse the question, query PubChem for related terms, and generate an LLM response.

    Args:
        query_text: Free-text user query.

    Returns:
        A fixed message string when no terms are extracted or PubChem
        yields no context; otherwise whatever ``generate_llm_response``
        returns for the fetched context and the original query.
    """
    # Step 1: Parse chemistry-related terms from the question
    chemistry_terms = extract_chemistry_terms(query_text)
    if not chemistry_terms:
        return "No chemistry-related terms found in the query."

    print(f"Extracted Chemistry Terms: {chemistry_terms}")

    # Step 2: Query PubChem for information on the terms
    pubchem_context = fetch_pubchem_data(chemistry_terms)
    if not pubchem_context:
        return "No relevant information found on PubChem."

    # Step 3: Use the fetched context in the LLM
    response = generate_llm_response(pubchem_context, query_text)
    return response
When process_input() is called with use_rag=True (plan_execute_agent/rdkit_agent.py:334), the RAG output is appended to the agent prompt as additional context:
async def process_input( input_prompt: str, image_path: str = None, use_rag: bool = False) -> tuple: # ... # Perform PUBCHEM_RAG on the original input query from plan_execute_agent.pubchem_rag.query_chemistry import ( query_chemistry_related, ) additional_info = "" if use_rag: additional_info = await asyncio.to_thread( query_chemistry_related, input_prompt + "\n" + extracted_text, ) try: additional_info = str(additional_info["text"]) except: additional_info = str(additional_info) print("Additional Info from PubChem RAG: ", additional_info) # The additional_info is then included in the agent prompt edited_prompt = ( # ... + "\nHere is the additional information from PubChem regarding the original query:\n" + additional_info # ... )
from plan_execute_agent.pubchem_rag.query_chemistry import query_chemistry_related

query = "What is the molecular weight and solubility of ibuprofen?"
response = query_chemistry_related(query)
print(response)
Use both RAG and image extraction for comprehensive analysis:
import asyncio

from plan_execute_agent.rdkit_agent import process_input

query = "What are the properties of this molecule?"
image_path = "unknown_compound.png"

# process_input is awaited via asyncio.run; the positions bound to `_` in
# the returned tuple are not needed here.
result, completed, attempts, _, errors, _ = asyncio.run(
    process_input(query, image_path=image_path, use_rag=True)
)
print(result)

# Workflow:
# 1. Extract structure from image (GPT-4o)
# 2. Convert to chemical name
# 3. Fetch PubChem data about the compound
# 4. Generate comprehensive response
You can customize term extraction for domain-specific needs:
from typing import List, Optional

import spacy

nlp = spacy.load("en_core_web_sm")


def extract_custom_terms(
    query_text: str, entity_types: Optional[List[str]] = None
) -> List[str]:
    """
    Extract terms with custom entity type filtering.

    Nouns and proper nouns longer than two characters are always included;
    named entities are added on top of them when ``entity_types`` is given.

    Args:
        query_text: Input query
        entity_types: spaCy entity labels to include in addition to the
            nouns/proper nouns (default: nouns/proper nouns only)

    Returns:
        Unique terms in order of first appearance.
    """
    doc = nlp(query_text)

    # A dict preserves insertion order, so de-duplicating through its keys
    # gives a stable result (a plain set would return terms in arbitrary order).
    terms = {}

    # Add nouns and proper nouns
    for token in doc:
        if token.pos_ in {"NOUN", "PROPN"} and len(token.text) > 2:
            terms[token.text] = None

    # Add named entities if specified
    if entity_types:
        wanted_labels = set(entity_types)
        for ent in doc.ents:
            if ent.label_ in wanted_labels:
                terms[ent.text] = None

    return list(terms)


# Example: add chemical/drug entities on top of the noun filter.
# NOTE: the labels must exist in the loaded pipeline. The NER component of
# en_core_web_sm has no "CHEMICAL" or "DRUG" labels, so with this model the
# call below returns only the noun/proper-noun terms; load a pipeline whose
# NER emits chemical labels to get entity matches.
query = "Compare the efficacy of aspirin and ibuprofen for pain relief"
terms = extract_custom_terms(query, entity_types=["CHEMICAL", "DRUG"])
print(terms)
from typing import Optional

from plan_execute_agent.pubchem_rag.query_chemistry import query_chemistry_related

# Messages query_chemistry_related returns in place of an answer: one when no
# terms are extracted, one when PubChem yields no context. Kept as a tuple so
# the membership test uses ==, which also works if the response is unhashable.
_NO_RESULT_MESSAGES = (
    "No chemistry-related terms found in the query.",
    "No relevant information found on PubChem.",
)


def safe_rag_query(query_text: str, fallback: Optional[str] = None):
    """
    Query with error handling and fallback.

    Args:
        query_text: Free-text user query passed to ``query_chemistry_related``.
        fallback: Value returned when there is no usable response or the
            query raises; a built-in message is used when it is None or empty.

    Returns:
        The RAG response, or ``fallback`` (else "No information found") when
        the response is empty or one of the no-result messages, or
        ``fallback`` (else "Query failed") when the query raises.
    """
    try:
        response = query_chemistry_related(query_text)

        # Treat an empty response and both no-result messages as "nothing found"
        if not response or response in _NO_RESULT_MESSAGES:
            return fallback or "No information found"

        return response
    except Exception as e:
        # Best-effort boundary: report the failure and fall back instead of raising
        print(f"RAG query failed: {e}")
        return fallback or "Query failed"


# Usage
result = safe_rag_query(
    "What is the structure of XYZ123?",
    fallback="Compound not found in PubChem"
)
print(result)