diff --git a/pyproject.toml b/pyproject.toml index fceed7d..e3cab9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "taskgen-ai" -version = "3.4.0" +version = "3.4.1" authors = [ { name="John Tan Chong Min", email="tanchongmin@gmail.com" }, ] @@ -18,7 +18,7 @@ classifiers = [ ] dependencies = ["openai>=1.59.6", "langchain", "dill>=0.3.9", "termcolor>=3.1.0", "requests", -"pypdf~=6.0.0", "python-docx", "pandas", "xlrd", +"python-docx", "pandas", "xlrd", "asyncio", "opentelemetry-sdk~=1.32.1"] [project.urls] diff --git a/requirements.txt b/requirements.txt index 42c2d14..e513baa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ langchain dill>=0.3.9 termcolor>=3.1.0 requests -pypdf~=6.0.0 python-docx pandas xlrd diff --git a/setup.py b/setup.py index bacc621..bc59078 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="taskgen", - version="3.4.0", + version="3.4.1", packages=find_packages(), install_requires=[ "openai>=1.59.6", @@ -10,7 +10,6 @@ "dill>=0.3.9", "termcolor>=3.1.0", "requests", - "pypdf~=6.0.0", "python-docx", "pandas", "xlrd", diff --git a/taskgen/memory.py b/taskgen/memory.py index 934fbc2..4f25a4d 100644 --- a/taskgen/memory.py +++ b/taskgen/memory.py @@ -4,7 +4,6 @@ import os import time from typing import Any -import pypdf from docx import Document from langchain_text_splitters import RecursiveCharacterTextSplitter # import chromadb @@ -58,8 +57,8 @@ def read_file(self, filepath, text_splitter=None): text = pd.read_csv(filepath).to_string() elif ".docx" in filepath: text = self.read_docx(filepath) - elif ".pdf" in filepath: - text = self.read_pdf(filepath) + # elif ".pdf" in filepath: + # text = self.read_pdf(filepath) else: raise ValueError( "File type not spported, supported file types: pdf, docx, csv, xls" @@ -78,18 +77,18 @@ def read_file(self, filepath, text_splitter=None): memories = [{"content": text, "filepath": filepath} for text in texts] return memories - def read_pdf(self, filepath): - # Open the PDF file - text_list = [] - with open(filepath, "rb") as file: - pdf_reader = pypdf.PdfReader(file) - for page in pdf_reader.pages: - page_text = page.extract_text() - if page_text: # Ensure there's text on the page - text_list.append(page_text) - else: - print("No text found on page") - return "\n".join(text_list) + # def read_pdf(self, filepath): + # # Open the PDF file + # text_list = [] + # with open(filepath, "rb") as file: + # pdf_reader = pypdf.PdfReader(file) + # for page in pdf_reader.pages: + # page_text = page.extract_text() + # if page_text: # Ensure there's text on the page + # text_list.append(page_text) + # else: + # print("No text found on page") + # return "\n".join(text_list) def read_docx(self, filepath): doc = Document(filepath)