Skip to content

Commit

Permalink
remove language detection
Browse files: browse the repository at this point in the history
  • Loading branch information
CodingWithTim committed Jan 5, 2025
1 parent 32c6724 commit 2cb0937
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 55 deletions.
43 changes: 3 additions & 40 deletions fastchat/serve/gradio_block_arena_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,20 +233,6 @@ def wrap_pdfchat_query(query, document):


# Number of LlamaParse attempts made before giving up on a document.
LLAMA_PARSE_MAX_RETRY = 2

# Tesseract language models to load for OCR, joined with "+" as expected by
# the `lang` argument of pytesseract. Tesseract identifies models by their
# traineddata names, so English must be spelled "eng" (not "en").
TESSERACT_SUPPORTED_LANGS = "+".join(
    [
        "eng",  # English
        "chi_tra",  # Chinese (traditional)
        "chi_sim",  # Chinese (simplified)
        "rus",  # Russian
        "spa",  # Spanish
        "jpn",  # Japanese
        "kor",  # Korean
        "fra",  # French
        "deu",  # German
        "vie",  # Vietnamese
    ]
)
LLAMAPARSE_SUPPORTED_LANGS = {
"English": "en",
"Chinese": "ch_sim",
Expand All @@ -260,44 +246,21 @@ def wrap_pdfchat_query(query, document):
}


def detect_language_from_doc(pdf_file_path):
    """Detect the language(s) of a PDF by OCR-ing its first page.

    The first page is rendered to an image, passed through Tesseract to
    extract text, and the text is handed to polyglot's language detector.

    Args:
        pdf_file_path: Path of the PDF file to inspect.

    Returns:
        A list with the ``name`` of every language polyglot reports,
        excluding entries whose name is ``"un"``.

    Raises:
        AssertionError: If the ``TESSDATA_PREFIX`` environment variable is
            unset or empty.
    """
    from pdf2image import convert_from_path
    from polyglot.detect import Detector

    import pytesseract  # Google's open-source OCR tool

    # Use .get() so an unset variable produces the helpful assertion message
    # instead of a bare KeyError.
    assert os.environ.get(
        "TESSDATA_PREFIX"
    ), "Make sure to specify location of train data for Tesseract."

    # Convert pdf into image (first page only for efficiency). Without the
    # page bounds pdf2image would render every page of the document.
    images = convert_from_path(pdf_file_path, first_page=1, last_page=1)

    extracted_text = pytesseract.image_to_string(
        images[0], lang=TESSERACT_SUPPORTED_LANGS
    )

    languages = Detector(extracted_text, quiet=True)
    return [lang.name for lang in languages.languages if lang.name != "un"]


def parse_pdf(file_path):
from llama_parse import LlamaParse

assert (
"LLAMA_CLOUD_API_KEY" in os.environ
), "Make sure to specify LlamaParse API key."

doc_lang = detect_language_from_doc(file_path)
doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]]

for _ in range(LLAMA_PARSE_MAX_RETRY):
try:
documents = LlamaParse(
result_type="markdown",
verbose=True,
language=doc_lang,
languages=list(
LLAMAPARSE_SUPPORTED_LANGS.values()
),
accurate_mode=True,
).load_data(file_path)
assert len(documents) > 0
Expand Down
16 changes: 1 addition & 15 deletions fastchat/serve/setup_pdfchat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,4 @@

# Install Python packages
# LlamaIndex / LlamaParse client libraries plus python-dotenv.
pip install llama-index-core llama-parse llama-index-readers-file python-dotenv
# polyglot is used for language detection; PyICU and pycld2 are presumably
# its runtime dependencies — TODO confirm.
pip install polyglot
pip install PyICU
pip install pycld2
# Python wrapper around the Tesseract OCR engine.
pip install pytesseract

# Renders PDF pages to images.
pip install pdf2image

# Clone the Tesseract tessdata repository
git clone https://github.com/tesseract-ocr/tessdata

# cd into tessdata and set TESSDATA_PREFIX to the current directory
# NOTE(review): `cd` and `export` only affect the shell running this script;
# unless the script is sourced, the caller will not see TESSDATA_PREFIX —
# confirm how this script is meant to be invoked.
cd tessdata
export TESSDATA_PREFIX="$(pwd)"

echo "TESSDATA_PREFIX is set to: $TESSDATA_PREFIX"
# NOTE(review): repeats the pdf2image install above — this looks like the
# replacement line from the diff view rather than a second install step.
pip install pdf2image

0 comments on commit 2cb0937

Please sign in to comment.