aboutsummaryrefslogtreecommitdiff
path: root/scrape_ecourtindia_v6/translate_to_english.py
diff options
context:
space:
mode:
Diffstat (limited to 'scrape_ecourtindia_v6/translate_to_english.py')
-rw-r--r--scrape_ecourtindia_v6/translate_to_english.py42
1 files changed, 0 insertions, 42 deletions
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
deleted file mode 100644
index 485a4b8..0000000
--- a/scrape_ecourtindia_v6/translate_to_english.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from tempfile import TemporaryDirectory
-
-import pytesseract
-from pdf2image import convert_from_path
-from PIL import Image
-
-from tinydb import TinyDB
-
-language = 'hin'
-
-def to_english(input_file, output_file):
- image_file_list = []
-
- with TemporaryDirectory() as tempdir:
- pdf_pages = convert_from_path(input_file, 500)
-
- for page_enumeration, page in enumerate(pdf_pages, start=1):
- filename = f"{tempdir}/page_{page_enumeration}.jpg"
- page.save(filename, "JPEG")
- image_file_list.append(filename)
-
- with open(output_file, "a") as h:
- for image_file in image_file_list:
- text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
-
- # In many PDFs, at line ending, if a word can't
- # be written fully, a 'hyphen' is added.
- # The rest of the word is written in the next line
- # Eg: This is a sample text this word here GeeksF-
- # orGeeks is half on first line, remaining on next.
- # To remove this, we replace every '-\n' to ''.
- text = text.replace("-\n", "")
-
- breakpoint()
-
- h.write(text)
-
-db = TinyDB('orders.json')
-entries = db.all()
-
-for entry in entries:
- to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')