diff options
| author | Raghuram Subramani <raghus2247@gmail.com> | 2025-04-22 20:54:47 +0530 | 
|---|---|---|
| committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-04-22 20:54:47 +0530 | 
| commit | 3ed36b1adb0be6a450afb755e192a7198187e052 (patch) | |
| tree | 635a10c9540bb5e93a4433d53a4279c68802157c /scrape_ecourtindia_v6/transcribe.py | |
| parent | c5d8880d6419e48b5c1450a5c1236576a47d2ac8 (diff) | |
update a few scripts
Diffstat (limited to 'scrape_ecourtindia_v6/transcribe.py')
| -rw-r--r-- | scrape_ecourtindia_v6/transcribe.py | 102 | 
1 file changed, 102 insertions, 0 deletions
"""OCR a CSV-selected set of PDFs with EasyOCR and gather the transcripts.

File: scrape_ecourtindia_v6/transcribe.py

Pipeline:
    1. ``read_csv_filenames`` picks the PDF names listed in a CSV file.
    2. ``batch_process_folder`` fans those PDFs out to ``process_pdf`` on a
       thread pool; each PDF gets its own ``<output>/<pdf_name>/`` directory
       holding the page images and an ``ocr_output.txt`` transcript.
    3. ``collect_txt_files`` copies every ``ocr_output.txt`` into one flat
       folder, renamed after the PDF it came from.
"""
import csv
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
# import pytesseract


def read_csv_filenames(csv_path):
    """Return the set of PDF file names found in the fifth CSV column.

    Args:
        csv_path: Path to a UTF-8 encoded CSV file.

    Returns:
        A set with the stripped value of column index 4 for every row whose
        value ends in ``.pdf`` (case-insensitive). Rows with fewer than five
        columns are skipped.
    """
    filenames = set()
    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile):
            # Index 4 is read below, so the row needs at least five columns.
            if len(row) < 5:
                continue
            filename = row[4].strip()
            if filename.lower().endswith('.pdf'):
                filenames.add(filename)
    return filenames


def process_pdf(pdf_path, output_folder, dpi=300, lang='hi'):
    """Render one PDF to page images, OCR each page and write a transcript.

    Creates ``<output_folder>/<pdf_name>/images/page_<n>.png`` for every page
    and ``<output_folder>/<pdf_name>/ocr_output.txt`` with the recognised
    text, each page introduced by a ``--- Page <n> ---`` header.

    Args:
        pdf_path: Path of the PDF to transcribe.
        output_folder: Directory under which the per-PDF folder is created.
        dpi: Resolution passed to ``convert_from_path``.
        lang: EasyOCR language code for the reader (``'hi'`` is Hindi).

    Returns:
        None. Errors raised while converting, recognising or writing are
        caught and printed; errors raised while importing the OCR packages,
        building the reader or creating directories propagate to the caller.
    """
    # Imported here rather than at module level so that the
    # transcript-collection path, which needs neither package, can run
    # without the OCR stack installed.
    import easyocr
    from pdf2image import convert_from_path

    reader = easyocr.Reader([lang], gpu=True)
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_dir = os.path.join(output_folder, pdf_name)
    images_dir = os.path.join(pdf_output_dir, "images")

    os.makedirs(images_dir, exist_ok=True)

    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        ocr_texts = []

        for page_number, image in enumerate(images, start=1):
            image_path = os.path.join(images_dir, f"page_{page_number}.png")
            image.save(image_path, "PNG")

            # GPU-accelerated OCR; detail=0 is requested so the result can be
            # joined directly as a list of strings.
            result = reader.readtext(image_path, detail=0)
            text = "\n".join(result)

            ocr_texts.append(f"--- Page {page_number} ---\n{text.strip()}\n")

        ocr_output_path = os.path.join(pdf_output_dir, "ocr_output.txt")
        with open(ocr_output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(ocr_texts))

        print(f"✅ Processed with GPU: {pdf_path} → {ocr_output_path}")
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole batch.
        print(f"❌ Error processing {pdf_path}: {e}")


def collect_txt_files(base_output_folder, destination_folder):
    """Copy every ``ocr_output.txt`` under a tree into one flat folder.

    Each copy is named ``<parent directory name>.txt``, i.e. after the
    per-PDF folder that ``process_pdf`` created.

    Args:
        base_output_folder: Root directory to search recursively.
        destination_folder: Directory receiving the copies; created if
            missing. It may live inside ``base_output_folder``.

    Returns:
        None.
    """
    os.makedirs(destination_folder, exist_ok=True)
    destination_abs = os.path.abspath(destination_folder)

    for root, dirs, files in os.walk(base_output_folder):
        # Never descend into the destination itself: a transcript collected
        # earlier as "ocr_output.txt" would otherwise be picked up again and
        # copied as "<destination name>.txt".
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != destination_abs
        ]

        for file in files:
            if file != "ocr_output.txt":
                continue
            full_path = os.path.join(root, file)
            new_name = os.path.basename(os.path.dirname(full_path)) + ".txt"
            dest_path = os.path.join(destination_folder, new_name)
            shutil.copy(full_path, dest_path)
            print(f"📁 Copied: {full_path} → {dest_path}")


def batch_process_folder(input_folder, output_folder, csv_path, dpi=300, lang='hi', max_threads=32):
    """Transcribe every PDF in a folder that is also listed in the CSV.

    Args:
        input_folder: Directory containing the PDF files.
        output_folder: Directory receiving the per-PDF output folders;
            created if missing.
        csv_path: CSV file whose fifth column names the PDFs to process.
        dpi: Rendering resolution forwarded to ``process_pdf``.
        lang: EasyOCR language code forwarded to ``process_pdf``.
        max_threads: Maximum number of worker threads.

    Returns:
        None. Per-file failures are printed, not raised.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Read allowed filenames from the CSV
    valid_filenames = read_csv_filenames(csv_path)

    # Only include matching PDF files
    pdf_files = [
        os.path.join(input_folder, filename)
        for filename in os.listdir(input_folder)
        if filename in valid_filenames
    ]

    print(f'number_of_files: {len(pdf_files)}')

    if not pdf_files:
        print("⚠️ No matching PDF files found in input folder.")
        return

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {
            executor.submit(process_pdf, pdf_path, output_folder, dpi, lang): pdf_path
            for pdf_path in pdf_files
        }

        for future in as_completed(futures):
            pdf_path = futures[future]
            try:
                future.result()
            except Exception as e:
                # Reached for failures process_pdf does not handle itself
                # (e.g. while building the OCR reader).
                print(f"⚠️ Failed to process {pdf_path}: {e}")

    # collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))


# Set your actual folders and CSV path
input_folder = "pdf"
output_folder = "transcribed"
csv_path = "files.csv"

# Run batch processing with CSV filtering
# NOTE: `lang` is handed to EasyOCR, whose Hindi code is 'hi'
# ('hin' is the Tesseract spelling and is not an EasyOCR code).
# batch_process_folder(input_folder, output_folder, csv_path, lang='hi', max_threads=2)
collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
