From 694115a67ca8672b3cbb3200f67f209ff7c9a650 Mon Sep 17 00:00:00 2001 From: mishaxmishra Date: Tue, 30 Sep 2025 13:59:49 +0530 Subject: [PATCH 1/2] Removing Chroma DB dependency --- pyproject.toml | 2 +- requirements.txt | 1 - setup.py | 2 - taskgen/__init__.py | 2 - taskgen/memory.py | 604 ++++++++++++++++++++++---------------------- 5 files changed, 304 insertions(+), 307 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 52a7508..d1e1d10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ ] dependencies = ["openai>=1.59.6", "langchain", "dill>=0.3.9", "termcolor>=3.1.0", "requests", -"pypdf~=6.0.0", "python-docx", "pandas", "xlrd", "chromadb>=1.0.15", +"pypdf~=6.0.0", "python-docx", "pandas", "xlrd", "asyncio", "opentelemetry-sdk~=1.32.1"] [project.urls] diff --git a/requirements.txt b/requirements.txt index b9a7f6d..42c2d14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,6 @@ pypdf~=6.0.0 python-docx pandas xlrd -chromadb>=1.0.15 asyncio #this specific version due to pycti in sa opentelemetry-sdk~=1.32.1 diff --git a/setup.py b/setup.py index 91998d4..bd38d69 100644 --- a/setup.py +++ b/setup.py @@ -13,9 +13,7 @@ "pypdf~=6.0.0", "python-docx", "pandas", - "chromadb", "xlrd", - "chromadb>=1.0.15", "asyncio", ], ) diff --git a/taskgen/__init__.py b/taskgen/__init__.py index bc040f2..59c1711 100644 --- a/taskgen/__init__.py +++ b/taskgen/__init__.py @@ -23,8 +23,6 @@ "Memory", "MemoryTemplate", "AsyncMemory", - "ChromaDbMemory", - "AsyncChromaDbMemory", "Agent", "AsyncAgent", ] diff --git a/taskgen/memory.py b/taskgen/memory.py index fc3205a..934fbc2 100644 --- a/taskgen/memory.py +++ b/taskgen/memory.py @@ -7,10 +7,10 @@ import pypdf from docx import Document from langchain_text_splitters import RecursiveCharacterTextSplitter -import chromadb -from chromadb.api.async_client import AsyncClient -from chromadb.api.client import Client -from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction +# import chromadb +# from chromadb.api.async_client import AsyncClient +# from chromadb.api.client import Client +# from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction import copy import pandas as pd @@ -303,6 +303,8 @@ async def retrieve_by_llm(self, task: str) -> list: return [self.memory[index] for index in top_k_indices] +############################DISABLING CHROMA DB FOR NOW############################ + ###################### ## Chroma vector db ## ###################### @@ -310,300 +312,300 @@ async def retrieve_by_llm(self, task: str) -> list: ## TODO: Make this non-OpenAI dependent -class BaseChromaDbMemory(MemoryTemplate): - """Takes in the following parameters: - `collection_name`: str. Compulsory. Name of the memory. Need to provide a unique name so that we can disambiguate between collections - `client` - Default: None. ChromaDB client to use, if any - `embedding_model`: Name of OpenAI's embedding_model to use with ChromaDB. Default OpenAI "text-embedding-3-small" - `top_k`: Number of elements to retrieve. Default: 3 - `mapper`: Function. Maps the memory value to the embedded value. We do not need to embed the whole value, so this will serve as a way to tell us what to embed. Default: lambda x: x - `pre_delete`: Bool. Default: False. If set to True, delete collection with all data inside it when initialising - """ - - def __init__( - self, - collection_name, - client=None, - embedding_model="text-embedding-3-small", - top_k=3, - mapper=lambda x: x, - pre_delete=False, - ): - # Evaluate async client for chroma db for storage - self.client = client or chromadb.PersistentClient() - self.embedding_model = embedding_model - self.top_k = top_k - self.embedding_function = OpenAIEmbeddingFunction( - api_key=os.environ.get("OPENAI_API_KEY"), model_name=self.embedding_model - ) - self.salt = os.urandom(16).hex() - self.collection_name = collection_name - self.mapper = mapper - self.collection = None - if pre_delete: - self.reset() - if isinstance(self.client, Client): - self.collection = self.client.get_or_create_collection( - self.collection_name, embedding_function=self.embedding_function - ) - - @abstractmethod - def get_openai_client(self): - pass - - @abstractmethod - def create_embedding(self, text): - pass - - @abstractmethod - def get_or_create_collection(self): - pass - - def remove(self, memories: list[str]): - if not isinstance(memories, list): - memories = [memories] - max_count = self.collection.count() - for memory in memories: - # retrieve up to top_k memories to remove - retrieved_ = self.collection.query( - query_texts=[memory], n_results=min(self.top_k, max_count) - ) - removed = False - for document, retrieved_id in zip( - retrieved_["documents"][0], retrieved_["ids"][0] - ): - # only remove on exact match - if document == memory: - self.collection.delete([retrieved_id]) - print(f"Removing memory: {memory}") - removed = True - if not removed: - print(f"No memory to remove: {memory}") - - def remove_by_id(self, ids): - if not isinstance(ids, list): - ids = [ids] - return self.collection.delete(ids) - - def generate_id(self, embedding): - # Generate a unique ID based on the embedding with added salt - current_time = str(time.time()).encode() - salted_embedding = str(embedding).encode() + self.salt.encode() + current_time - return hashlib.sha256(salted_embedding).hexdigest() - - def reset(self): - return self.client.delete_collection(name=self.collection.name) - - def retrieve(self, task: str, filter=[]): - max_count = self.collection.count() - results = self.collection.query( - query_texts=[task], n_results=min(self.top_k, max_count), where=filter - )["metadatas"][0] - - return [ - mem_item["taskgen_content"] if "taskgen_content" in mem_item else mem_item - for mem_item in results - ] - # return self.collection.query( - # query_texts=[task], n_results=self.top_k, where=filter - # ) - - -class ChromaDbMemory(BaseChromaDbMemory): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.openai_client = self.get_openai_client() - self.collection = self.get_or_create_collection() - - def get_openai_client(self): - from openai import OpenAI - - return OpenAI() - - def get_or_create_collection(self): - return self.client.get_or_create_collection( - self.collection_name, embedding_function=self.embedding_function - ) - - def add_file(self, filepath, text_splitter=None): - new_memories = self.read_file(filepath, text_splitter) - return self.append(new_memories, mapper=lambda x: x["content"]) - - def create_embedding(self, text): - return ( - self.openai_client.embeddings.create( - input=[text], model=self.embedding_model - ) - .data[0] - .embedding - ) - - def append(self, new_memories, mapper=None): - if not isinstance(new_memories, list): - new_memories = [new_memories] - - if mapper: - memory_strings = [mapper(memory) for memory in new_memories] - else: - memory_strings = [self.mapper(memory) for memory in new_memories] - - if all(isinstance(item, dict) for item in new_memories): - metadatas = new_memories - else: - metadatas = [{"taskgen_content": item} for item in new_memories] - - return self.append_memory_list(memory_strings, metadatas) - - def append_memory_list(self, new_memories: list[str], metadatas: list[dict] = None): - embeddings = [self.create_embedding(text) for text in new_memories] - if metadatas is None: - metadatas = [{} for _ in new_memories] - ids = [ - metadata.get("id") or self.generate_id(embedding) - for metadata, embedding in zip(metadatas, embeddings) - ] - return self.collection.upsert( - ids=ids, - embeddings=embeddings, - documents=new_memories, - metadatas=metadatas, - ) - - -class AsyncChromaDbMemory(BaseChromaDbMemory): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.openai_client = self.get_openai_client() - - def get_openai_client(self): - from openai import AsyncOpenAI - - return AsyncOpenAI() - - async def get_or_create_collection(self): - if self.collection: - return self.collection - if isinstance(self.client, Client): - return self.client.get_or_create_collection( - self.collection_name, embedding_function=self.embedding_function - ) - elif isinstance(self.client, AsyncClient): - return await self.client.get_or_create_collection( - self.collection_name, embedding_function=self.embedding_function - ) - - async def add_file(self, filepath, text_splitter=None): - new_memories = self.read_file(filepath, text_splitter) - return await self.append(new_memories, mapper=lambda x: x["content"]) - - async def create_embedding(self, text): - embedding = await self.openai_client.embeddings.create( - input=[text], model=self.embedding_model - ) - return embedding.data[0].embedding - - async def append(self, new_memories=[], mapper=None): - if not isinstance(new_memories, list): - new_memories = [new_memories] - - if mapper: - memory_strings = [mapper(memory) for memory in new_memories] - else: - memory_strings = [self.mapper(memory) for memory in new_memories] - - if all(isinstance(item, dict) for item in new_memories): - metadatas = new_memories - else: - metadatas = [{"taskgen_content": item} for item in new_memories] - - return await self.append_memory_list(memory_strings, metadatas) - - async def append_memory_list( - self, new_memories: list[str], metadatas: list[dict] = None - ): - self.collection = await self.get_or_create_collection() - embeddings = await asyncio.gather( - *[self.create_embedding(text) for text in new_memories] - ) - if metadatas is None: - metadatas = [{} for _ in new_memories] - ids = [ - metadata.get("id") or self.generate_id(embedding) - for metadata, embedding in zip(metadatas, embeddings) - ] - if isinstance(self.client, AsyncClient): - return await self.collection.upsert( - ids=ids, - embeddings=embeddings, - documents=new_memories, - metadatas=metadatas, - ) - elif isinstance(self.client, Client): - return self.collection.upsert( - ids=ids, - embeddings=embeddings, - documents=new_memories, - metadatas=metadatas, - ) - else: - raise Exception("Unknown client type") - - async def remove(self, memories: list[str]): - self.collection = await self.get_or_create_collection() - if isinstance(self.client, AsyncClient): - if not isinstance(memories, list): - memories = [memories] - max_count = self.collection.count() - n_results_num = min(self.top_k, max_count) - for memory in memories: - # retrieve up to top_k memories to remove - retrieved_ = await self.collection.query( - query_texts=[memory], n_results=n_results_num - ) - removed = False - for document, retrieved_id in zip( - retrieved_["documents"][0], retrieved_["ids"][0] - ): - # only remove on exact match - if document == memory: - await self.collection.delete([retrieved_id]) - print(f"Removing memory: {memory}") - removed = True - if not removed: - print(f"No memory to remove: {memory}") - else: - return super().remove(memories) - - async def remove_by_id(self, ids=[]): - self.collection = await self.get_or_create_collection() - if isinstance(self.client, AsyncClient): - return await self.collection.delete(ids) - else: - return super().remove(ids) - - async def reset(self): - self.collection = await self.get_or_create_collection() - if isinstance(self.client, AsyncClient): - return await self.client.delete_collection(name=self.collection.name) - return super().reset() - - async def retrieve(self, task: str, filter=[]): - self.collection = await self.get_or_create_collection() - if isinstance(self.client, AsyncClient): - max_count = self.collection.count() - n_results_num = min(self.top_k, max_count) - results = await self.collection.query( - query_texts=[task], n_results=n_results_num, where=filter - )["metadatas"][0] - - return [ - ( - mem_item["taskgen_content"] - if "taskgen_content" in mem_item - else mem_item - ) - for mem_item in results - ] - - else: - return super().retrieve(task, filter) +# class BaseChromaDbMemory(MemoryTemplate): +# """Takes in the following parameters: +# `collection_name`: str. Compulsory. Name of the memory. Need to provide a unique name so that we can disambiguate between collections +# `client` - Default: None. ChromaDB client to use, if any +# `embedding_model`: Name of OpenAI's embedding_model to use with ChromaDB. Default OpenAI "text-embedding-3-small" +# `top_k`: Number of elements to retrieve. Default: 3 +# `mapper`: Function. Maps the memory value to the embedded value. We do not need to embed the whole value, so this will serve as a way to tell us what to embed. Default: lambda x: x +# `pre_delete`: Bool. Default: False. If set to True, delete collection with all data inside it when initialising +# """ + +# def __init__( +# self, +# collection_name, +# client=None, +# embedding_model="text-embedding-3-small", +# top_k=3, +# mapper=lambda x: x, +# pre_delete=False, +# ): +# # Evaluate async client for chroma db for storage +# self.client = client or chromadb.PersistentClient() +# self.embedding_model = embedding_model +# self.top_k = top_k +# self.embedding_function = OpenAIEmbeddingFunction( +# api_key=os.environ.get("OPENAI_API_KEY"), model_name=self.embedding_model +# ) +# self.salt = os.urandom(16).hex() +# self.collection_name = collection_name +# self.mapper = mapper +# self.collection = None +# if pre_delete: +# self.reset() +# if isinstance(self.client, Client): +# self.collection = self.client.get_or_create_collection( +# self.collection_name, embedding_function=self.embedding_function +# ) + +# @abstractmethod +# def get_openai_client(self): +# pass + +# @abstractmethod +# def create_embedding(self, text): +# pass + +# @abstractmethod +# def get_or_create_collection(self): +# pass + +# def remove(self, memories: list[str]): +# if not isinstance(memories, list): +# memories = [memories] +# max_count = self.collection.count() +# for memory in memories: +# # retrieve up to top_k memories to remove +# retrieved_ = self.collection.query( +# query_texts=[memory], n_results=min(self.top_k, max_count) +# ) +# removed = False +# for document, retrieved_id in zip( +# retrieved_["documents"][0], retrieved_["ids"][0] +# ): +# # only remove on exact match +# if document == memory: +# self.collection.delete([retrieved_id]) +# print(f"Removing memory: {memory}") +# removed = True +# if not removed: +# print(f"No memory to remove: {memory}") + +# def remove_by_id(self, ids): +# if not isinstance(ids, list): +# ids = [ids] +# return self.collection.delete(ids) + +# def generate_id(self, embedding): +# # Generate a unique ID based on the embedding with added salt +# current_time = str(time.time()).encode() +# salted_embedding = str(embedding).encode() + self.salt.encode() + current_time +# return hashlib.sha256(salted_embedding).hexdigest() + +# def reset(self): +# return self.client.delete_collection(name=self.collection.name) + +# def retrieve(self, task: str, filter=[]): +# max_count = self.collection.count() +# results = self.collection.query( +# query_texts=[task], n_results=min(self.top_k, max_count), where=filter +# )["metadatas"][0] + +# return [ +# mem_item["taskgen_content"] if "taskgen_content" in mem_item else mem_item +# for mem_item in results +# ] +# # return self.collection.query( +# # query_texts=[task], n_results=self.top_k, where=filter +# # ) + + +# class ChromaDbMemory(BaseChromaDbMemory): +# def __init__(self, *args, **kwargs): +# super().__init__(*args, **kwargs) +# self.openai_client = self.get_openai_client() +# self.collection = self.get_or_create_collection() + +# def get_openai_client(self): +# from openai import OpenAI + +# return OpenAI() + +# def get_or_create_collection(self): +# return self.client.get_or_create_collection( +# self.collection_name, embedding_function=self.embedding_function +# ) + +# def add_file(self, filepath, text_splitter=None): +# new_memories = self.read_file(filepath, text_splitter) +# return self.append(new_memories, mapper=lambda x: x["content"]) + +# def create_embedding(self, text): +# return ( +# self.openai_client.embeddings.create( +# input=[text], model=self.embedding_model +# ) +# .data[0] +# .embedding +# ) + +# def append(self, new_memories, mapper=None): +# if not isinstance(new_memories, list): +# new_memories = [new_memories] + +# if mapper: +# memory_strings = [mapper(memory) for memory in new_memories] +# else: +# memory_strings = [self.mapper(memory) for memory in new_memories] + +# if all(isinstance(item, dict) for item in new_memories): +# metadatas = new_memories +# else: +# metadatas = [{"taskgen_content": item} for item in new_memories] + +# return self.append_memory_list(memory_strings, metadatas) + +# def append_memory_list(self, new_memories: list[str], metadatas: list[dict] = None): +# embeddings = [self.create_embedding(text) for text in new_memories] +# if metadatas is None: +# metadatas = [{} for _ in new_memories] +# ids = [ +# metadata.get("id") or self.generate_id(embedding) +# for metadata, embedding in zip(metadatas, embeddings) +# ] +# return self.collection.upsert( +# ids=ids, +# embeddings=embeddings, +# documents=new_memories, +# metadatas=metadatas, +# ) + + +# class AsyncChromaDbMemory(BaseChromaDbMemory): +# def __init__(self, *args, **kwargs): +# super().__init__(*args, **kwargs) +# self.openai_client = self.get_openai_client() + +# def get_openai_client(self): +# from openai import AsyncOpenAI + +# return AsyncOpenAI() + +# async def get_or_create_collection(self): +# if self.collection: +# return self.collection +# if isinstance(self.client, Client): +# return self.client.get_or_create_collection( +# self.collection_name, embedding_function=self.embedding_function +# ) +# elif isinstance(self.client, AsyncClient): +# return await self.client.get_or_create_collection( +# self.collection_name, embedding_function=self.embedding_function +# ) + +# async def add_file(self, filepath, text_splitter=None): +# new_memories = self.read_file(filepath, text_splitter) +# return await self.append(new_memories, mapper=lambda x: x["content"]) + +# async def create_embedding(self, text): +# embedding = await self.openai_client.embeddings.create( +# input=[text], model=self.embedding_model +# ) +# return embedding.data[0].embedding + +# async def append(self, new_memories=[], mapper=None): +# if not isinstance(new_memories, list): +# new_memories = [new_memories] + +# if mapper: +# memory_strings = [mapper(memory) for memory in new_memories] +# else: +# memory_strings = [self.mapper(memory) for memory in new_memories] + +# if all(isinstance(item, dict) for item in new_memories): +# metadatas = new_memories +# else: +# metadatas = [{"taskgen_content": item} for item in new_memories] + +# return await self.append_memory_list(memory_strings, metadatas) + +# async def append_memory_list( +# self, new_memories: list[str], metadatas: list[dict] = None +# ): +# self.collection = await self.get_or_create_collection() +# embeddings = await asyncio.gather( +# *[self.create_embedding(text) for text in new_memories] +# ) +# if metadatas is None: +# metadatas = [{} for _ in new_memories] +# ids = [ +# metadata.get("id") or self.generate_id(embedding) +# for metadata, embedding in zip(metadatas, embeddings) +# ] +# if isinstance(self.client, AsyncClient): +# return await self.collection.upsert( +# ids=ids, +# embeddings=embeddings, +# documents=new_memories, +# metadatas=metadatas, +# ) +# elif isinstance(self.client, Client): +# return self.collection.upsert( +# ids=ids, +# embeddings=embeddings, +# documents=new_memories, +# metadatas=metadatas, +# ) +# else: +# raise Exception("Unknown client type") + +# async def remove(self, memories: list[str]): +# self.collection = await self.get_or_create_collection() +# if isinstance(self.client, AsyncClient): +# if not isinstance(memories, list): +# memories = [memories] +# max_count = self.collection.count() +# n_results_num = min(self.top_k, max_count) +# for memory in memories: +# # retrieve up to top_k memories to remove +# retrieved_ = await self.collection.query( +# query_texts=[memory], n_results=n_results_num +# ) +# removed = False +# for document, retrieved_id in zip( +# retrieved_["documents"][0], retrieved_["ids"][0] +# ): +# # only remove on exact match +# if document == memory: +# await self.collection.delete([retrieved_id]) +# print(f"Removing memory: {memory}") +# removed = True +# if not removed: +# print(f"No memory to remove: {memory}") +# else: +# return super().remove(memories) + +# async def remove_by_id(self, ids=[]): +# self.collection = await self.get_or_create_collection() +# if isinstance(self.client, AsyncClient): +# return await self.collection.delete(ids) +# else: +# return super().remove(ids) + +# async def reset(self): +# self.collection = await self.get_or_create_collection() +# if isinstance(self.client, AsyncClient): +# return await self.client.delete_collection(name=self.collection.name) +# return super().reset() + +# async def retrieve(self, task: str, filter=[]): +# self.collection = await self.get_or_create_collection() +# if isinstance(self.client, AsyncClient): +# max_count = self.collection.count() +# n_results_num = min(self.top_k, max_count) +# results = await self.collection.query( +# query_texts=[task], n_results=n_results_num, where=filter +# )["metadatas"][0] + +# return [ +# ( +# mem_item["taskgen_content"] +# if "taskgen_content" in mem_item +# else mem_item +# ) +# for mem_item in results +# ] + +# else: +# return super().retrieve(task, filter) From a94874abc0b1a2d0a0ca683cbe863a8040eb1866 Mon Sep 17 00:00:00 2001 From: mishaxmishra Date: Tue, 30 Sep 2025 14:09:28 +0530 Subject: [PATCH 2/2] Bumped version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d1e1d10..fceed7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "taskgen-ai" -version = "3.3.11" +version = "3.4.0" authors = [ { name="John Tan Chong Min", email="tanchongmin@gmail.com" }, ] diff --git a/setup.py b/setup.py index bd38d69..bacc621 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="taskgen", - version="3.3.11", + version="3.4.0", packages=find_packages(), install_requires=[ "openai>=1.59.6",