Skip to content

Commit

Permalink
fix: megaparse sdk with nats (#3496)
Browse files (browse the repository at this point in the history)
* Adapt deps
* Change megaparse processor inner file processing
  • Loading branch information
chloedia authored Nov 25, 2024
1 parent a4e42b0 commit e68b4f4
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 32 deletions.
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
"faiss-cpu>=1.8.0.post1",
"rapidfuzz>=3.10.1",
"markupsafe>=2.1.5",
"megaparse[all]== 0.0.43",
"megaparse-sdk==0.1.7"
]
readme = "README.md"
requires-python = ">= 3.11"
Expand Down
35 changes: 5 additions & 30 deletions core/quivr_core/processor/implementations/megaparse_processor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,8 +3,8 @@
import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse.core.megaparse import MegaParse
from megaparse.core.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.client import MegaParseNATSClient
from megaparse_sdk.config import ClientNATSConfig

from quivr_core.config import MegaparseConfig
from quivr_core.files.file import QuivrFile
Expand Down Expand Up @@ -75,9 +75,9 @@ def processor_metadata(self):

async def process_file_inner(self, file: QuivrFile) -> list[Document]:
logger.info(f"Uploading file {file.path} to MegaParse")
parser = UnstructuredParser(**self.megaparse_config.model_dump())
megaparse = MegaParse(parser)
response = await megaparse.aload(file.path)
async with MegaParseNATSClient(ClientNATSConfig()) as client:
response = await client.parse_file(file=file.path)

logger.info(f"File : {response}")
document = Document(
page_content=response,
Expand All @@ -87,28 +87,3 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
return docs

# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
# megaparse = MegaParseSDK(api_key)
# logger.info(f"Uploading file {file.path} to MegaParse")
# data = {
# "method": self.megaparse_config.method,
# "strategy": self.megaparse_config.strategy,
# "check_table": self.megaparse_config.check_table,
# "parsing_instruction": self.megaparse_config.parsing_instruction,
# "model_name": self.megaparse_config.model_name,
# }
# response = await megaparse.file.upload(
# file_path=str(file.path),
# **data,
# )
# document = Document(
# page_content=response["result"],
# )
# if len(response) > self.splitter_config.chunk_size:
# docs = self.text_splitter.split_documents([document])
# for doc in docs:
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
# return docs
# return [document]
2 changes: 1 addition & 1 deletion examples/simple_question_megaparse.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,7 +11,7 @@
if __name__ == "__main__":
brain = Brain.from_files(
name="test_brain",
file_paths=["./tests/processor/docx/demo.docx"],
file_paths=["./tests/processor/pdf/sample.pdf"],
llm=LLMEndpoint(
llm_config=LLMEndpointConfig(model="gpt-4o"),
llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),
Expand Down

0 comments on commit e68b4f4

Please sign in to comment.