scrape_ecourtindia_v6/translate_to_english.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

from tempfile import TemporaryDirectory
 
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

from tinydb import TinyDB
 
language = 'hin'
 
def to_english(input_file, output_file):
    image_file_list = []

    with TemporaryDirectory() as tempdir:
        pdf_pages = convert_from_path(input_file, 500)

        for page_enumeration, page in enumerate(pdf_pages, start=1):
            filename = f"{tempdir}/page_{page_enumeration}.jpg"
            page.save(filename, "JPEG")
            image_file_list.append(filename)
 
        with open(output_file, "a") as h:
            for image_file in image_file_list:
                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
 
                # In many PDFs, at line ending, if a word can't
                # be written fully, a 'hyphen' is added.
                # The rest of the word is written in the next line
                # Eg: This is a sample text this word here GeeksF-
                # orGeeks is half on first line, remaining on next.
                # To remove this, we replace every '-\n' to ''.
                text = text.replace("-\n", "")

                breakpoint()
 
                h.write(text)

db = TinyDB('orders.json')
entries = db.all()

for entry in entries:
    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')