From 7e9f6e5edbd73a971406864ff6de84f43e6ead23 Mon Sep 17 00:00:00 2001
From: Touseef Ahmad <124333084+touseefahmed96@users.noreply.github.com>
Date: Thu, 23 Jan 2025 13:41:43 +0500
Subject: [PATCH] RAG on your huggingface_doc data using chromadb and groq api (#235)

* RAG on your PDF data using chromadb and groq api

* Multiple embeeding and llm support

* Multiple embedding and llm support

* Default embeddings set to hugging face

* organize imports

* huggingface_doc data source added

* Update and rename Local_PDF_RAG_using_chromadb.py to rag_using_chromadb.py

* Quality fix

* Default agent set to CodeAgent
---
 examples/rag_using_chromadb.py | 130 +++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 examples/rag_using_chromadb.py

diff --git a/examples/rag_using_chromadb.py b/examples/rag_using_chromadb.py
new file mode 100644
index 0000000..864bfc8
--- /dev/null
+++ b/examples/rag_using_chromadb.py
@@ -0,0 +1,130 @@
+import os
+
+import datasets
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_chroma import Chroma
+
+# from langchain_community.document_loaders import PyPDFLoader
+from langchain_huggingface import HuggingFaceEmbeddings
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+# from langchain_openai import OpenAIEmbeddings
+from smolagents import LiteLLMModel, Tool
+from smolagents.agents import CodeAgent
+
+
+# from smolagents.agents import ToolCallingAgent
+
+
+knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
+
+source_docs = [
+    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]}) for doc in knowledge_base
+]
+
+## For your own PDFs, you can use the following code to load them into source_docs
+# pdf_directory = "pdfs"
+# pdf_files = [
+#     os.path.join(pdf_directory, f)
+#     for f in os.listdir(pdf_directory)
+#     if f.endswith(".pdf")
+# ]
+# source_docs = []
+
+# for file_path in pdf_files:
+#     loader = PyPDFLoader(file_path)
+#     source_docs.extend(loader.load())
+
+text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+    AutoTokenizer.from_pretrained("thenlper/gte-small"),
+    chunk_size=200,
+    chunk_overlap=20,
+    add_start_index=True,
+    strip_whitespace=True,
+    separators=["\n\n", "\n", ".", " ", ""],
+)
+
+# Split docs and keep only unique ones
+print("Splitting documents...")
+docs_processed = []
+unique_texts = {}
+for doc in tqdm(source_docs):
+    new_docs = text_splitter.split_documents([doc])
+    for new_doc in new_docs:
+        if new_doc.page_content not in unique_texts:
+            unique_texts[new_doc.page_content] = True
+            docs_processed.append(new_doc)
+
+
+print("Embedding documents... This should take a few minutes (about 5 minutes on a MacBook with M1 Pro)")
+# Initialize embeddings and ChromaDB vector store
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+
+# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+
+vector_store = Chroma.from_documents(docs_processed, embeddings, persist_directory="./chroma_db")
+
+
+class RetrieverTool(Tool):
+    name = "retriever"
+    description = (
+        "Uses semantic search to retrieve the parts of documentation that could be most relevant to answer your query."
+    )
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, vector_store, **kwargs):
+        super().__init__(**kwargs)
+        self.vector_store = vector_store
+
+    def forward(self, query: str) -> str:
+        assert isinstance(query, str), "Your search query must be a string"
+        docs = self.vector_store.similarity_search(query, k=3)
+        return "\nRetrieved documents:\n" + "".join(
+            [f"\n\n===== Document {str(i)} =====\n" + doc.page_content for i, doc in enumerate(docs)]
+        )
+
+
+retriever_tool = RetrieverTool(vector_store)
+
+# Choose which LLM engine to use!
+
+# from smolagents import HfApiModel
+# model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct")
+
+# from smolagents import TransformersModel
+# model = TransformersModel(model_id="meta-llama/Llama-3.2-3B-Instruct")
+
+# For Anthropic: change model_id below to 'anthropic/claude-3-5-sonnet-20240620' and api_key to os.environ.get("ANTHROPIC_API_KEY")
+model = LiteLLMModel(
+    model_id="groq/llama-3.3-70b-versatile",
+    api_key=os.environ.get("GROQ_API_KEY"),
+)
+
+# # You can also use the ToolCallingAgent class
+# agent = ToolCallingAgent(
+#     tools=[retriever_tool],
+#     model=model,
+#     verbosity_level=2,
+# )
+
+agent = CodeAgent(
+    tools=[retriever_tool],
+    model=model,
+    max_steps=4,
+    verbosity_level=2,
+)
+
+agent_output = agent.run("How can I push a model to the Hub?")
+
+
+print("Final output:")
+print(agent_output)
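
A quick way to sanity-check retrieval quality before spending Groq API calls is to
invoke the tool directly, bypassing the agent loop. A minimal sketch, assuming the
names defined in the script above (note the affirmative-form query, per the tool's
own input description):

    # Call the tool's forward() directly; it returns the top-3 chunks as a string.
    preview = retriever_tool.forward("push a model to the Hub")
    print(preview)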
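Because the script passes persist_directory="./chroma_db" to Chroma.from_documents,
the index survives on disk, so a later session can reopen it instead of re-embedding
everything. A minimal sketch, assuming the script has already been run once and the
same embedding model is reused:

    from langchain_chroma import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    # Reopen the persisted collection; queries must use the same embedding model
    # that built the index, or the vectors will not be comparable.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
    print(vector_store.similarity_search("push a model to the Hub", k=3)[0].page_content)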