diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py index e5b529bff..bd745791f 100644 --- a/fastchat/serve/gradio_block_arena_vision.py +++ b/fastchat/serve/gradio_block_arena_vision.py @@ -233,20 +233,6 @@ def wrap_pdfchat_query(query, document): LLAMA_PARSE_MAX_RETRY = 2 -TESSERACT_SUPPORTED_LANGS = "+".join( - [ - "en", - "chi_tra", - "chi_sim", - "rus", - "spa", - "jpn", - "kor", - "fra", - "deu", # German - "vie", - ] -) LLAMAPARSE_SUPPORTED_LANGS = { "English": "en", "Chinese": "ch_sim", @@ -260,28 +246,6 @@ def wrap_pdfchat_query(query, document): } -def detect_language_from_doc(pdf_file_path): - from pdf2image import convert_from_path - from polyglot.detect import Detector - - import pytesseract # Google's open-source OCR tool - - assert os.environ[ - "TESSDATA_PREFIX" - ], "Make sure to specify location of train data for Tesseract." - - # Convert pdf into image (first page only for efficiency) - images = convert_from_path(pdf_file_path) - - extracted_text = pytesseract.image_to_string( - images[0], lang=TESSERACT_SUPPORTED_LANGS - ) - - languages = Detector(extracted_text, quiet=True) - # return languages - return [lang.name for lang in languages.languages if lang.name != "un"] - - def parse_pdf(file_path): from llama_parse import LlamaParse @@ -289,15 +253,14 @@ def parse_pdf(file_path): "LLAMA_CLOUD_API_KEY" in os.environ ), "Make sure to specify LlamaParse API key." - doc_lang = detect_language_from_doc(file_path) - doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]] - for _ in range(LLAMA_PARSE_MAX_RETRY): try: documents = LlamaParse( result_type="markdown", verbose=True, - language=doc_lang, + languages=list( + LLAMAPARSE_SUPPORTED_LANGS.values() + ), accurate_mode=True, ).load_data(file_path) assert len(documents) > 0 diff --git a/fastchat/serve/setup_pdfchat.sh b/fastchat/serve/setup_pdfchat.sh index e2d3fbc15..2511b939d 100644 --- a/fastchat/serve/setup_pdfchat.sh +++ b/fastchat/serve/setup_pdfchat.sh @@ -2,18 +2,4 @@ # Install Python packages pip install llama-index-core llama-parse llama-index-readers-file python-dotenv -pip install polyglot -pip install PyICU -pip install pycld2 -pip install pytesseract - -pip install pdf2image - -# Clone the Tesseract tessdata repository -git clone https://github.com/tesseract-ocr/tessdata - -# cd into tessdata and set TESSDATA_PREFIX to the current directory -cd tessdata -export TESSDATA_PREFIX="$(pwd)" - -echo "TESSDATA_PREFIX is set to: $TESSDATA_PREFIX" \ No newline at end of file +pip install pdf2image \ No newline at end of file