RAG on your huggingface_doc data using chromadb and groq api (#235)
* RAG on your PDF data using chromadb and groq api
* Multiple embedding and llm support
* Default embeddings set to hugging face
* organize imports
* huggingface_doc data source added
* Update and rename Local_PDF_RAG_using_chromadb.py to rag_using_chromadb.py
* Quality fix
* Default agent set to CodeAgent
This commit is contained in:
parent a806f50ef2
commit 7e9f6e5edb

@@ -0,0 +1,130 @@
import os

import datasets
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm
from transformers import AutoTokenizer

# from langchain_openai import OpenAIEmbeddings
from smolagents import LiteLLMModel, Tool
from smolagents.agents import CodeAgent


# from smolagents.agents import ToolCallingAgent


knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")

source_docs = [
    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]}) for doc in knowledge_base
]

## For your own PDFs, you can use the following code to load them into source_docs
# pdf_directory = "pdfs"
# pdf_files = [
#     os.path.join(pdf_directory, f)
#     for f in os.listdir(pdf_directory)
#     if f.endswith(".pdf")
# ]
# source_docs = []

# for file_path in pdf_files:
#     loader = PyPDFLoader(file_path)
#     source_docs.extend(loader.load())

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)
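# Note: because the splitter is built with from_huggingface_tokenizer, chunk_size
# and chunk_overlap are measured in gte-small tokens rather than characters, so
# each chunk stays within the embedding model's 512-token context window.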

# Split docs and keep only unique ones
print("Splitting documents...")
docs_processed = []
unique_texts = {}
for doc in tqdm(source_docs):
    new_docs = text_splitter.split_documents([doc])
    for new_doc in new_docs:
        if new_doc.page_content not in unique_texts:
            unique_texts[new_doc.page_content] = True
            docs_processed.append(new_doc)


print("Embedding documents... This should take a few minutes (about 5 minutes on a MacBook with M1 Pro)")
# Initialize embeddings and ChromaDB vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = Chroma.from_documents(docs_processed, embeddings, persist_directory="./chroma_db")
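# Chroma persists the index to ./chroma_db, so later runs can reload it instead
# of re-embedding everything (a sketch, assuming the same embeddings object):
# vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)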


class RetrieverTool(Tool):
    name = "retriever"
    description = (
        "Uses semantic search to retrieve the parts of documentation that could be most relevant to answer your query."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, vector_store, **kwargs):
        super().__init__(**kwargs)
        self.vector_store = vector_store

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Your search query must be a string"
        docs = self.vector_store.similarity_search(query, k=3)
        return "\nRetrieved documents:\n" + "".join(
            [f"\n\n===== Document {i} =====\n" + doc.page_content for i, doc in enumerate(docs)]
        )


retriever_tool = RetrieverTool(vector_store)
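# Optional sanity check before involving the LLM: call the tool directly
# (illustrative only, not part of the original flow):
# print(retriever_tool.forward("push a model to the Hub"))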

# Choose which LLM engine to use!

# from smolagents import HfApiModel
# model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct")

# from smolagents import TransformersModel
# model = TransformersModel(model_id="meta-llama/Llama-3.2-3B-Instruct")

# For Anthropic, set model_id to 'anthropic/claude-3-5-sonnet-20240620' and pass api_key=os.environ.get("ANTHROPIC_API_KEY")
model = LiteLLMModel(
    model_id="groq/llama-3.3-70b-versatile",
    api_key=os.environ.get("GROQ_API_KEY"),
)
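# The Anthropic variant described in the comment above (requires ANTHROPIC_API_KEY):
# model = LiteLLMModel(
#     model_id="anthropic/claude-3-5-sonnet-20240620",
#     api_key=os.environ.get("ANTHROPIC_API_KEY"),
# )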

# # You can also use the ToolCallingAgent class
# agent = ToolCallingAgent(
#     tools=[retriever_tool],
#     model=model,
#     verbose=True,
# )

agent = CodeAgent(
    tools=[retriever_tool],
    model=model,
    max_steps=4,
    verbosity_level=2,
)

agent_output = agent.run("How can I push a model to the Hub?")


print("Final output:")
print(agent_output)
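# The index and agent are reusable; further questions can be asked without
# re-embedding (an illustrative query, not from the original script):
# agent_output = agent.run("How do I load a dataset from the Hub?")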