from tempfile import TemporaryDirectory
 
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

from tinydb import TinyDB
 
language = 'hin'
 
def to_english(input_file, output_file):
    image_file_list = []

    with TemporaryDirectory() as tempdir:
        pdf_pages = convert_from_path(input_file, 500)

        for page_enumeration, page in enumerate(pdf_pages, start=1):
            filename = f"{tempdir}/page_{page_enumeration}.jpg"
            page.save(filename, "JPEG")
            image_file_list.append(filename)
 
        with open(output_file, "a") as h:
            for image_file in image_file_list:
                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
 
                # In many PDFs, at line ending, if a word can't
                # be written fully, a 'hyphen' is added.
                # The rest of the word is written in the next line
                # Eg: This is a sample text this word here GeeksF-
                # orGeeks is half on first line, remaining on next.
                # To remove this, we replace every '-\n' to ''.
                text = text.replace("-\n", "")

                breakpoint()
 
                h.write(text)

db = TinyDB('orders.json')
entries = db.all()

for entry in entries:
    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')