author    Raghuram Subramani <raghus2247@gmail.com>  2025-04-22 20:54:47 +0530
committer Raghuram Subramani <raghus2247@gmail.com>  2025-04-22 20:54:47 +0530
commit    3ed36b1adb0be6a450afb755e192a7198187e052 (patch)
tree      635a10c9540bb5e93a4433d53a4279c68802157c /scrape_ecourtindia_v6
parent    c5d8880d6419e48b5c1450a5c1236576a47d2ac8 (diff)
update a few scripts
Diffstat (limited to 'scrape_ecourtindia_v6')
-rw-r--r--  scrape_ecourtindia_v6/.gitignore                 |   2
-rw-r--r--  scrape_ecourtindia_v6/create_csv.py              |   4
-rw-r--r--  scrape_ecourtindia_v6/create_named_pdfs.py       |   3
-rw-r--r--  scrape_ecourtindia_v6/modules/scraper_orders.py  |   5
-rw-r--r--  scrape_ecourtindia_v6/scrape_orders.py           |   9
-rw-r--r--  scrape_ecourtindia_v6/search_for_words.py        | 109
-rw-r--r--  scrape_ecourtindia_v6/transcribe.py              | 102
-rw-r--r--  scrape_ecourtindia_v6/translate_to_english.py    |  42
8 files changed, 224 insertions, 52 deletions
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index 1aed0d4..36f0da5 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -6,3 +6,5 @@ html/*
bak/
translated/*
*.json
+transcribed/*
+txt/*
diff --git a/scrape_ecourtindia_v6/create_csv.py b/scrape_ecourtindia_v6/create_csv.py
index 5561b73..1bf8860 100644
--- a/scrape_ecourtindia_v6/create_csv.py
+++ b/scrape_ecourtindia_v6/create_csv.py
@@ -6,10 +6,10 @@ entries = db.all()
 csvfile = open('orders.csv', 'w', newline='')
 w = csv.writer(csvfile)
 
-w.writerow(['Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
+w.writerow(['District', 'Court Name', 'Case Info', 'Petitioner/Respondent', 'Date', 'File'])
 
 for entry in entries:
-    ent = [entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
+    ent = [entry['district'], entry['court_name'], entry['case_info'], entry['petitioner_respondent'], entry['date'], f'http://aarch.compromyse.xyz:8000/{entry["filename"]}']
     w.writerow(ent)
 
 csvfile.close()
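
create_csv.py now expects every TinyDB record to carry a 'district' key ahead of the existing fields. A minimal sketch of the record shape the writer assumes; only the keys come from the scraper, the values here are invented for illustration:

# Hypothetical record as stored in orders.json; keys match what
# scraper_orders.py inserts, values are made up.
sample_entry = {
    'district': 'Agra',
    'court_name': 'District and Sessions Court',
    'case_info': 'POCSO Case/123/2023',
    'petitioner_respondent': 'State Vs Accused',
    'date': '01-01-2024',
    'filename': 'pdf/abc123.pdf',
}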
diff --git a/scrape_ecourtindia_v6/create_named_pdfs.py b/scrape_ecourtindia_v6/create_named_pdfs.py
index c47c66e..a37fc10 100644
--- a/scrape_ecourtindia_v6/create_named_pdfs.py
+++ b/scrape_ecourtindia_v6/create_named_pdfs.py
@@ -13,11 +13,12 @@ db = TinyDB('orders.json')
 entries = db.all()
 
 for entry in entries:
+    district = sanitize_filename(entry['district'])
     date = sanitize_filename(entry['date'])
     case_info = sanitize_filename(entry['case_info'])
     court_name = sanitize_filename(entry['court_name'])
 
-    newname = f"named_pdf/{date}---{case_info}---{court_name}.pdf"
+    newname = f"named_pdf/{district}---{date}---{case_info}---{court_name}.pdf"
 
     try:
         shutil.copyfile(entry['filename'], newname)
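
Because '---' separates the sanitized fields, the metadata stays recoverable from a generated filename. A small sketch, assuming no field itself contains '---' (the filename below is hypothetical):

# Hypothetical filename produced by the new naming scheme:
name = 'named_pdf/Agra---01-01-2024---SC12023---District Court.pdf'
fields = name[len('named_pdf/'):-len('.pdf')].split('---')
district, date, case_info, court_name = fields
print(district, date)  # Agra 01-01-2024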
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
index d0b8df3..0a54a91 100644
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -71,7 +71,7 @@ class ScraperOrders(Scraper):
             self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
             i += 5
 
-    def handle_orders(self, court_name):
+    def handle_orders(self, court_name, district):
         for row in self.rows:
             order = row[0]
 
@@ -97,7 +97,8 @@ class ScraperOrders(Scraper):
             except:
                 print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-            record = { 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
+            record = { 'district': district, 'court_name': court_name, 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
             self.db.insert(record)
+            sleep(0.7)
 
         self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
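
The added sleep(0.7) paces successive PDF downloads inside the orders loop. A self-contained sketch of the same throttling pattern in isolation; throttled is a hypothetical helper, not part of the scraper:

from time import sleep

def throttled(items, delay=0.7):
    """Yield items with a fixed pause after each one (a simple rate limit)."""
    for item in items:
        yield item
        sleep(delay)

for row in throttled(['order-1', 'order-2', 'order-3']):
    print(row)  # successive iterations are spaced ~0.7 s apart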
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
index 146119e..e254967 100644
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -51,7 +51,7 @@ def scrape_single_court(row):
     scraper.submit_search()
 
     scraper.parse_orders_table()
-    scraper.handle_orders(row[3])
+    scraper.handle_orders(row[3], row[1])
 
     scraper.driver.quit()
 
@@ -63,7 +63,7 @@ def scrape_orders(courts_csv):
         reader = csv.reader(csvfile)
         courts = list(reader)
 
-    with ThreadPoolExecutor(max_workers=5) as executor:
+    with ThreadPoolExecutor(max_workers=1) as executor:
         futures = [
             executor.submit(scrape_single_court, court)
             for court in courts
@@ -75,6 +75,5 @@ def scrape_orders(courts_csv):
         except Exception as e:
             print(f"A thread encountered an error: {e}")
 
-if __name__ == '__main__':
-    input_file = 'csv/2023-24_pocso.csv'
-    scrape_orders(input_file)
+input_file = 'csv/2023-24_pocso_all_districts.csv'
+scrape_orders(input_file)
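
With max_workers=1 the executor runs the courts strictly one at a time while keeping the per-future error isolation: one failing court no longer aborts the rest. A self-contained sketch of that behaviour, where job and the court list are hypothetical stand-ins for scrape_single_court and the CSV rows:

from concurrent.futures import ThreadPoolExecutor, as_completed

def job(court):  # hypothetical stand-in for scrape_single_court
    if court == 'broken':
        raise RuntimeError('driver crashed')
    return court

courts = ['court-a', 'broken', 'court-b']
with ThreadPoolExecutor(max_workers=1) as executor:
    futures = [executor.submit(job, c) for c in courts]
    for future in as_completed(futures):
        try:
            print(f'done: {future.result()}')
        except Exception as e:
            print(f'A thread encountered an error: {e}')  # remaining jobs still run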
diff --git a/scrape_ecourtindia_v6/search_for_words.py b/scrape_ecourtindia_v6/search_for_words.py
new file mode 100644
index 0000000..effcea9
--- /dev/null
+++ b/scrape_ecourtindia_v6/search_for_words.py
@@ -0,0 +1,109 @@
+import os
+import csv
+import re
+import argostranslate.translate
+
+# Load Argos Translate model (assumes it's already installed)
+installed_languages = argostranslate.translate.load_installed_languages()
+hi_lang = next(filter(lambda x: x.code == "hi", installed_languages))
+en_lang = next(filter(lambda x: x.code == "en", installed_languages))
+translator = hi_lang.get_translation(en_lang)
+
+# Hindi phrases to search
+phrases = [
+    "किशोर",              # "juvenile"
+    "किशोर न्यायालय",      # "juvenile court"
+    "बोर्ड",               # "board"
+    "प्रारंभिक आकलन",      # "preliminary assessment"
+    "प्रारंभिक निर्धारण",   # "preliminary determination"
+    "बालक"                # "child"
+]
+
+main_phrases = ["किशोर", "किशोर न्यायालय"]  # "juvenile", "juvenile court"
+
+input_dir = "txt"
+output_csv_hindi = "output_hindi.csv"
+output_csv_english = "output_english.csv"
+base_url = "https://aarch.compromyse.xyz:8000/txt/"
+
+# Extract up to 10 snippets for a phrase
+def extract_snippets(text, phrase, window=10, max_count=10):
+    words = text.split()
+    snippets = []
+    for i, word in enumerate(words):
+        if phrase in word:
+            start = max(0, i - window)
+            end = min(len(words), i + window + 1)
+            snippet = ' '.join(words[start:end])
+            snippets.append(snippet)
+            if len(snippets) >= max_count:
+                break
+    return snippets
+
+# CSV header
+header = ["File", "File URL"]
+for phrase in phrases:
+    header.append(f"{phrase} Present")
+    if phrase in main_phrases:
+        for i in range(1, 11):
+            header.append(f"{phrase} Snippet {i}")
+    else:
+        header.append(f"{phrase} Snippet")
+
+# Process files
+results = []
+for filename in os.listdir(input_dir):
+    if filename.endswith(".txt"):
+        filepath = os.path.join(input_dir, filename)
+        with open(filepath, 'r', encoding='utf-8') as f:
+            text = f.read()
+        file_url = base_url + filename
+        row = [filename, file_url]
+
+        for phrase in phrases:
+            found = phrase in text
+            row.append("Yes" if found else "No")
+
+            if found:
+                snippets = extract_snippets(text, phrase, max_count=10)
+                if phrase in main_phrases:
+                    row.extend(snippets + [""] * (10 - len(snippets)))
+                else:
+                    row.append(snippets[0] if snippets else "")
+            else:
+                if phrase in main_phrases:
+                    row.extend([""] * 10)
+                else:
+                    row.append("")
+        results.append(row)
+
+# Write Hindi CSV
+with open(output_csv_hindi, 'w', encoding='utf-8-sig', newline='') as f:
+    writer = csv.writer(f)
+    writer.writerow(header)
+    writer.writerows(results)
+
+# Translate header
+translated_header = [translator.translate(cell) if re.search(r'[\u0900-\u097F]', cell) else cell for cell in header]
+
+# Translate rows
+translated_rows = [translated_header]
+for row in results:
+    translated_row = []
+    for cell in row:
+        try:
+            if re.search(r'[\u0900-\u097F]', cell):  # Only translate if Hindi (Devanagari) detected
+                translated_row.append(translator.translate(cell))
+            else:
+                translated_row.append(cell)
+        except:
+            translated_row.append(cell)
+    translated_rows.append(translated_row)
+
+# Write English CSV
+with open(output_csv_english, 'w', encoding='utf-8-sig', newline='') as f:
+    writer = csv.writer(f)
+    writer.writerows(translated_rows)
+
+print(f"✅ Hindi CSV saved to: {output_csv_hindi}")
+print(f"✅ English CSV saved to: {output_csv_english}")
diff --git a/scrape_ecourtindia_v6/transcribe.py b/scrape_ecourtindia_v6/transcribe.py
new file mode 100644
index 0000000..80f5094
--- /dev/null
+++ b/scrape_ecourtindia_v6/transcribe.py
@@ -0,0 +1,102 @@
+import os
+import easyocr
+import shutil
+import csv
+from pdf2image import convert_from_path
+# import pytesseract
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+def read_csv_filenames(csv_path):
+    filenames = set()
+    with open(csv_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            if len(row) >= 5:  # row[4] needs at least five columns
+                filename = row[4].strip()
+                if filename.lower().endswith('.pdf'):
+                    filenames.add(filename)
+    return filenames
+
+def process_pdf(pdf_path, output_folder, dpi=300, lang='hi'):
+    reader = easyocr.Reader(['hi'], gpu=True)  # 'hi' is for Hindi
+    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+    pdf_output_dir = os.path.join(output_folder, pdf_name)
+    images_dir = os.path.join(pdf_output_dir, "images")
+
+    os.makedirs(images_dir, exist_ok=True)
+
+    try:
+        images = convert_from_path(pdf_path, dpi=dpi)
+        ocr_texts = []
+
+        for i, image in enumerate(images):
+            image_path = os.path.join(images_dir, f"page_{i+1}.png")
+            image.save(image_path, "PNG")
+
+            # GPU-accelerated OCR
+            result = reader.readtext(image_path, detail=0)
+            text = "\n".join(result)
+
+            ocr_texts.append(f"--- Page {i+1} ---\n{text.strip()}\n")
+
+        ocr_output_path = os.path.join(pdf_output_dir, "ocr_output.txt")
+        with open(ocr_output_path, "w", encoding="utf-8") as f:
+            f.write("\n".join(ocr_texts))
+
+        print(f"✅ Processed with GPU: {pdf_path} → {ocr_output_path}")
+    except Exception as e:
+        print(f"❌ Error processing {pdf_path}: {e}")
+
+def collect_txt_files(base_output_folder, destination_folder):
+    os.makedirs(destination_folder, exist_ok=True)
+    for root, dirs, files in os.walk(base_output_folder):
+        for file in files:
+            if file == "ocr_output.txt":
+                full_path = os.path.join(root, file)
+                new_name = os.path.basename(os.path.dirname(full_path)) + ".txt"
+                dest_path = os.path.join(destination_folder, new_name)
+                shutil.copy(full_path, dest_path)
+                print(f"📁 Copied: {full_path} → {dest_path}")
+
+def batch_process_folder(input_folder, output_folder, csv_path, dpi=300, lang='hi', max_threads=32):
+    os.makedirs(output_folder, exist_ok=True)
+
+    # Read allowed filenames from the CSV
+    valid_filenames = read_csv_filenames(csv_path)
+
+    # Only include matching PDF files
+    pdf_files = [
+        os.path.join(input_folder, filename)
+        for filename in os.listdir(input_folder)
+        if filename in valid_filenames
+    ]
+
+    print(f'number_of_files: {len(pdf_files)}')
+
+    if not pdf_files:
+        print("⚠️ No matching PDF files found in input folder.")
+        return
+
+    with ThreadPoolExecutor(max_workers=max_threads) as executor:
+        futures = {
+            executor.submit(process_pdf, pdf_path, output_folder, dpi, lang): pdf_path
+            for pdf_path in pdf_files
+        }
+
+        for future in as_completed(futures):
+            pdf_path = futures[future]
+            try:
+                future.result()
+            except Exception as e:
+                print(f"⚠️ Failed to process {pdf_path}: {e}")
+
+    # collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
+
+# Set your actual folders and CSV path
+input_folder = "pdf"
+output_folder = "transcribed"
+csv_path = "files.csv"
+
+# Run batch processing with CSV filtering
+# batch_process_folder(input_folder, output_folder, csv_path, lang='hin', max_threads=2)
+collect_txt_files(output_folder, os.path.join(output_folder, "all_texts"))
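
read_csv_filenames keeps only rows whose fifth column (row[4]) names a .pdf, and batch_process_folder then matches those names against os.listdir(input_folder); for the filter to select anything, files.csv has to store bare basenames. A hypothetical row under that assumption:

import csv

# files.csv layout assumed here (hypothetical): the PDF basename sits in
# the fifth column, so it can match entries from os.listdir('pdf').
with open('files.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerow(
        ['Court', 'Case Info', 'Parties', '01-01-2024', 'abc123.pdf'])

# read_csv_filenames('files.csv') would then return {'abc123.pdf'}

Separately, process_pdf constructs a fresh easyocr.Reader per PDF, and each construction reloads the detection and recognition models; with max_threads=32 that can mean many model copies resident at once, which is worth keeping in mind when tuning max_threads.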
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
deleted file mode 100644
index 485a4b8..0000000
--- a/scrape_ecourtindia_v6/translate_to_english.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from tempfile import TemporaryDirectory
-
-import pytesseract
-from pdf2image import convert_from_path
-from PIL import Image
-
-from tinydb import TinyDB
-
-language = 'hin'
-
-def to_english(input_file, output_file):
-    image_file_list = []
-
-    with TemporaryDirectory() as tempdir:
-        pdf_pages = convert_from_path(input_file, 500)
-
-        for page_enumeration, page in enumerate(pdf_pages, start=1):
-            filename = f"{tempdir}/page_{page_enumeration}.jpg"
-            page.save(filename, "JPEG")
-            image_file_list.append(filename)
-
-        with open(output_file, "a") as h:
-            for image_file in image_file_list:
-                text = str(pytesseract.image_to_string(Image.open(image_file), lang=language))
-
-                # In many PDFs, when a word doesn't fit at the end of a
-                # line, a hyphen is added and the rest of the word is
-                # written on the next line.
-                # Eg: a word like "GeeksF-\norGeeks" is half on the
-                # first line, with the remainder on the next.
-                # To undo this, we replace every '-\n' with ''.
-                text = text.replace("-\n", "")
-
-                breakpoint()
-
-                h.write(text)
-
-db = TinyDB('orders.json')
-entries = db.all()
-
-for entry in entries:
-    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')