# AutoRAG/document_processor.py (main branch of function0553/AutoRAG on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import tempfile
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from document_classifier import DocumentClassifier

class DocumentProcessor:
    """Load uploaded documents, split them into chunks, and classify them.

    Supported file types are ``pdf``, ``docx``, ``txt`` and ``md``; any other
    type is rejected with ``ValueError`` by :meth:`load_document`.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """Create the text splitter and the document classifier.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Length shared between consecutive chunks.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )
        self.classifier = DocumentClassifier()

    @staticmethod
    def _normalize_file_type(file_type):
        """Return *file_type* lower-cased, without whitespace or leading dots."""
        return str(file_type).strip().lower().lstrip(".")

    def load_document(self, file_path, file_type):
        """Load a document from disk with the loader matching its type.

        Args:
            file_path: Path of the file to load.
            file_type: File type (``pdf``, ``docx``, ``txt`` or ``md``).
                Matching ignores case and a leading dot, so ``"PDF"`` and
                ``".pdf"`` are accepted as well.

        Returns:
            Whatever the selected loader's ``load()`` returns.

        Raises:
            ValueError: If ``file_type`` is not one of the supported types.
            Exception: Any error raised by the loader is printed and
                re-raised unchanged.
        """
        normalized = self._normalize_file_type(file_type)
        try:
            if normalized == "pdf":
                loader = PyPDFLoader(file_path)
            elif normalized == "docx":
                loader = Docx2txtLoader(file_path)
            elif normalized in ("txt", "md"):
                loader = TextLoader(file_path, encoding="utf-8")
            else:
                raise ValueError(f"不支持的文件类型: {file_type}")

            return loader.load()
        except Exception as e:
            # Report the failure on the console, then let the caller decide.
            print(f"加载文档失败: {e}")
            raise

    def split_document(self, documents):
        """Split documents into smaller chunks.

        Args:
            documents: List of document objects.

        Returns:
            The result of ``self.text_splitter.split_documents(documents)``.

        Raises:
            Exception: Any splitter error is printed and re-raised unchanged.
        """
        try:
            return self.text_splitter.split_documents(documents)
        except Exception as e:
            print(f"分割文档失败: {e}")
            raise

    def process_file(self, file, file_type):
        """Load and split a single uploaded file.

        The upload is copied to a temporary file on disk because the loaders
        are constructed from a file path. The temporary file is always
        removed, even when reading, loading or splitting fails.

        Args:
            file: Uploaded file object (e.g. from Streamlit) exposing
                ``read()``.
            file_type: File type; see :meth:`load_document`.

        Returns:
            List of document chunks produced by :meth:`split_document`.

        Raises:
            Exception: Propagates errors from reading the upload, from
                :meth:`load_document` and from :meth:`split_document`.
        """
        # An upload that was already read elsewhere has its position at the
        # end; without rewinding, read() would return nothing and an empty
        # temporary file would be loaded.
        seekable = getattr(file, "seekable", None)
        if callable(seekable) and seekable():
            file.seek(0)

        suffix = self._normalize_file_type(file_type)
        temp_file_path = None
        try:
            # Created inside the try so a failing read()/write() cannot leave
            # the delete=False temporary file behind.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{suffix}") as temp_file:
                temp_file_path = temp_file.name
                temp_file.write(file.read())

            # NOTE(review): the loaders presumably record this temporary path
            # as the document source - confirm whether callers need the
            # original upload name instead.
            documents = self.load_document(temp_file_path, file_type)
            return self.split_document(documents)
        finally:
            if temp_file_path is not None and os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    def process_files(self, uploaded_files):
        """Load and split several uploaded files.

        Files that fail to process are reported on the console and skipped,
        so one bad upload does not discard the others.

        Args:
            uploaded_files: List of uploaded file objects (e.g. from
                Streamlit), each exposing ``name`` and ``read()``. ``None``
                or an empty list yields an empty result.

        Returns:
            Chunks of all successfully processed files, in upload order.
        """
        all_split_docs = []
        if not uploaded_files:
            return all_split_docs

        for file in uploaded_files:
            # The file type is the text after the last dot of the name.
            file_type = file.name.split(".")[-1].lower()
            try:
                split_docs = self.process_file(file, file_type)
                all_split_docs.extend(split_docs)
                print(f"成功处理文件: {file.name}")
            except Exception as e:
                # Deliberate best effort: report and move on to the next file.
                print(f"处理文件 {file.name} 失败: {e}")
                continue

        return all_split_docs

    def classify_documents(self, documents):
        """Classify documents by delegating to the document classifier.

        Args:
            documents: List of document objects.

        Returns:
            The result of ``self.classifier.classify_documents(documents)``.
        """
        return self.classifier.classify_documents(documents)