diff options
Diffstat (limited to 'scrape_ecourtindia_v6/translate_to_english.py')
-rw-r--r-- | scrape_ecourtindia_v6/translate_to_english.py | 42 |
1 files changed, 0 insertions, 42 deletions
# Reconstructed from the deletion diff of
# scrape_ecourtindia_v6/translate_to_english.py (blob 485a4b8, 42 lines).
"""OCR every PDF listed in ``orders.json`` into a text file under ``translated/``.

Each PDF is rasterised page by page, every page image is run through
Tesseract, and the recognised text is appended to one ``.txt`` file per PDF.
"""

from tempfile import TemporaryDirectory

import pytesseract
from pdf2image import convert_from_path
from PIL import Image

from tinydb import TinyDB

# Tesseract language code passed as ``lang=`` when no override is given.
language = 'hin'


def to_english(input_file, output_file, *, dpi=500, lang=None):
    """Append the OCR text of every page of a PDF to a text file.

    Despite the name, no translation is performed here: the pages are only
    recognised with Tesseract and the text is written out as recognised.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the text file to append to. It is opened in
            append mode, so repeated runs accumulate text.
        dpi: Resolution used when rasterising the PDF pages.
        lang: Tesseract language code; defaults to the module-level
            ``language`` looked up at call time.
    """
    if lang is None:
        lang = language

    # Everything that touches the page images stays inside this block: the
    # directory and the JPEGs in it are removed as soon as the block exits.
    with TemporaryDirectory() as tempdir:
        pdf_pages = convert_from_path(input_file, dpi)

        image_files = []
        for page_number, page in enumerate(pdf_pages, start=1):
            filename = f"{tempdir}/page_{page_number}.jpg"
            page.save(filename, "JPEG")
            image_files.append(filename)

        # Explicit UTF-8: the recognised text is non-ASCII, and the platform
        # default encoding is not guaranteed to be able to represent it.
        with open(output_file, "a", encoding="utf-8") as out:
            for image_file in image_files:
                # Close each image handle once its text has been extracted.
                with Image.open(image_file) as image:
                    text = pytesseract.image_to_string(image, lang=lang)

                # A word that does not fit at the end of a line is often
                # split with a hyphen and continued on the next line.
                # Dropping every '-\n' joins the two halves again.
                out.write(text.replace("-\n", ""))


db = TinyDB('orders.json')
entries = db.all()

for entry in entries:
    # [4:-4] drops the first and last four characters of the stored name;
    # presumably a 4-character directory prefix and the '.pdf' suffix --
    # TODO confirm against the code that writes orders.json.
    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')