| author | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-31 14:30:38 +0530 |
|---|---|---|
| committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-31 14:30:57 +0530 |
| commit | 0f188ea1e638e6abddb03d49b9209c703081b2fe (patch) | |
| tree | cfe69bb82158fccf9eb4d5737d0c9c1603c5e1f1 | |
| parent | 97d1df0cd10f9f4adc1991cc8067cc8f1d3978cf (diff) | |
update
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | flake.nix | 46 |
| -rw-r--r-- | scrape_ecourtindia_v6/.gitignore | 6 |
| -rw-r--r-- | scrape_ecourtindia_v6/modules/scraper.py | 13 |
| -rw-r--r-- | scrape_ecourtindia_v6/modules/scraper_case_status.py | 58 |
| -rw-r--r-- | scrape_ecourtindia_v6/results/scraping_results.csv | 1 |
| -rw-r--r-- | scrape_ecourtindia_v6/scrape_case_status.py | 134 |
| -rw-r--r-- | scrape_ecourtindia_v6/scrape_case_status_states.py | 70 |
| -rw-r--r-- | scrape_ecourtindia_v6/translate_to_english.py | 42 |
| -rw-r--r-- | test/.gitignore | 2 |
| -rw-r--r-- | test/transcribe.py | 14 |
10 files changed, 252 insertions, 134 deletions
```diff
diff --git a/flake.nix b/flake.nix
--- a/flake.nix
+++ b/flake.nix
@@ -2,27 +2,33 @@
   inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
 
   outputs = { self, nixpkgs, ... }: let
-    pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
-  in {
-    devShells.x86_64-linux.default = pkgs.mkShell {
-      buildInputs = with pkgs; [
-        (python3.withPackages (p: [
-          p.selenium
-          p.opencv-python
-          p.pytesseract
-          p.beautifulsoup4
-          p.tinydb
-          p.fastapi
-          p.uvicorn
-          p.jinja2
-        ]))
-        pyright
+    system = "x86_64-linux";
+    pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
+  in {
+    devShells.${system}.default = pkgs.mkShell {
+      buildInputs = with pkgs; [
+        (python3.withPackages (p: [
+          p.selenium
+          p.opencv-python
+          p.pytesseract
+          p.beautifulsoup4
+          p.tinydb
+          p.fastapi
+          p.uvicorn
+          p.jinja2
-        firefox
-        geckodriver
+          # p.pdf2image
+          # p.openai-whisper
+          # p.torch-bin
+        ]))
-        tesseract
-      ];
-    };
+        pyright
+
+        firefox
+        geckodriver
+
+        tesseract
+      ];
     };
+  };
 }
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index f32422f..1aed0d4 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
-courts.csv
+*.csv
 csv/*
 named_pdf/*
 pdf/*
 html/*
-orders.json
+bak/
+translated/*
+*.json
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
index 4616763..140302e 100644
--- a/scrape_ecourtindia_v6/modules/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@ class Scraper:
         sleep(1)
 
     def select(self, i_d, value):
-        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
+        while True:
+            try:
+                element = self.driver.find_element(By.ID, i_d)
+                break
+            except:
+                sleep(0.2)
+                pass
+
         select = Select(element)
         select.select_by_visible_text(value)
         sleep(1)
@@ -52,6 +58,9 @@ class Scraper:
 
         return complexes
 
+    def establishments_visible(self):
+        return self.driver.find_element(By.ID, 'court_est_code').is_displayed()
+
     def scrape_establishments(self):
         element = self.driver.find_element(By.ID, 'court_est_code')
         options = Select(element).options
```
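The new `select` polls `find_element` in a bare `try`/`except` loop. Selenium ships an explicit-wait helper that expresses the same intent with a bounded timeout; a minimal sketch of that alternative (the 10-second timeout is an assumption, not part of this commit):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def select(self, i_d, value):
    # Poll for the element like the loop above, but give up after 10s
    # (assumed timeout) instead of spinning forever if the page never loads.
    element = WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.ID, i_d))
    )
    Select(element).select_by_visible_text(value)
```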
```diff
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 684d9d7..b4a9ec3 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@ import uuid
 from urllib import request
 
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
 
@@ -13,45 +12,30 @@ import cv2
 import pytesseract
 import tempfile
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperCaseStatus(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
-
-        self.db = TinyDB('db.json')
-        self.config = config
+    def __init__(self):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
 
-    def select_act(self):
-        self.select('actcode', self.config['act'])
+    def select_act(self, act):
+        self.select('actcode', act)
         sleep(1)
 
         # Disposed only
         self.driver.find_element(By.ID, 'radDAct').click()
 
         self.submit_search()
 
-    def select_court(self):
-        sleep(2)
+    def goto_acts(self):
         while True:
-            self.select('sess_state_code', self.config['state'])
-            self.select('sess_dist_code', self.config['district'])
-            self.select('court_complex_code', self.config['court_complex'])
-
-            sleep(2)
-            modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
-            if modal_is_open:
+            try:
                 self.close_modal()
-                continue
-
-            break
-
-        self.select('court_est_code', self.config['court_establishment'])
+                element = self.driver.find_element(By.ID, 'act-tabMenu')
+                element.click()
+                break
+            except:
+                pass
 
-    def goto_acts(self):
-        element = self.driver.find_element(By.ID, 'act-tabMenu')
-        element.click()
         sleep(1)
 
     def submit_search(self):
@@ -77,8 +61,12 @@ class ScraperCaseStatus(Scraper):
             else:
                 captcha_incomplete = False
 
-    def handle_table(self):
-        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+    def handle_table(self, db):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return
+
         self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
         self.views = []
         i = 5
@@ -109,7 +97,7 @@ class ScraperCaseStatus(Scraper):
 
             self.parse_orders_table()
 
-            self.db.insert(self.current_view)
+            db.insert(self.current_view)
             print(f'INSERTED: {self.current_view}')
             self.driver.find_element(By.ID, 'main_back_act').click()
             i += 4
@@ -134,7 +122,7 @@ class ScraperCaseStatus(Scraper):
 
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
 
-            sleep(0.7)
+            sleep(1)
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
@@ -153,4 +141,10 @@ class ScraperCaseStatus(Scraper):
         except:
             print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-        self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+        sleep(1)
+        while True:
+            try:
+                self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+                break
+            except:
+                pass
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
new file mode 100644
index 0000000..35dff1a
--- /dev/null
+++ b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -0,0 +1 @@
+State,District,Complex,Establishment,Records
```
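The bare `except:` around the `dispTable` lookup also swallows keyboard interrupts and genuine bugs. If the intent is only "no results table rendered", Selenium raises a specific exception that can be caught instead; a narrower sketch, not the committed code:

```python
from selenium.common.exceptions import NoSuchElementException

try:
    table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
except NoSuchElementException:
    # No results table on this page; skip instead of crashing.
    return
```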
```diff
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 2b543ba..a8891fd 100644
--- a/scrape_ecourtindia_v6/scrape_case_status.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,67 @@
-import csv
+from time import sleep
 
 from modules.scraper_case_status import ScraperCaseStatus
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
+from tinydb import TinyDB
 
-SCRAPE_ESTABLISHMENTS = True
+db = TinyDB('db.json')
 
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
-        self.lock = threading.Lock()
+scraper = ScraperCaseStatus()
 
-    def writerow(self, row):
-        with self.lock:
-            self.writer.writerow(row)
+state = 'Karnataka'
+act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'
 
-    def close(self):
-        self.file.close()
+scraper.close_modal()
+scraper.select('sess_state_code', state)
+sleep(1)
 
-def scrape_state_thread(state, config, csv_writer):
-    scraper = ScraperCaseStatus(config)
-    scraper.close_modal()
-    try:
-        scraper.select('sess_state_code', state)
-        for district in scraper.scrape_districts():
+for district in scraper.scrape_districts():
+    print(f'SELECTING DISTRICT {district}')
+    while True:
+        try:
+            scraper.close_modal()
             scraper.select('sess_dist_code', district)
-            for cmplx in scraper.scrape_complexes():
+            break
+        except:
+            pass
+    sleep(1)
+
+    for cmplx in scraper.scrape_complexes():
+        sleep(1)
+        print(f'SELECTING COMPLEX {cmplx}')
+        while True:
+            try:
+                scraper.close_modal()
                 scraper.select('court_complex_code', cmplx)
-                if SCRAPE_ESTABLISHMENTS:
-                    establishments = []
-                    for establishment in scraper.scrape_establishments():
-                        establishments.append(establishment)
-
-                    csv_writer.writerow([ state, district, cmplx ] + establishments)
-                else:
-                    csv_writer.writerow([ state, district, cmplx ])
-    except Exception as e:
-        print(f"Error scraping {state}: {e}")
-    finally:
-        scraper.driver.quit()
-
-def scrape_courts():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
-    csv_writer.writerow(['State', 'District', 'Complex'])
-
-    states = m.scrape_states()
-    m.driver.close()
-
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        futures = [
-            executor.submit(scrape_state_thread, state, config, csv_writer)
-            for state in states
-        ]
-
-        for future in as_completed(futures):
+                break
+            except:
+                pass
+        try:
+            scraper.driver.switch_to.alert.accept();
+            scraper.close_modal()
+        except:
+            pass
+
+        for establishment in scraper.scrape_establishments():
+            sleep(1)
+            print(f'SELECTING ESTABLISHMENT {establishment}')
+            while True:
+                try:
+                    scraper.close_modal()
+                    scraper.select('court_est_code', establishment)
+                    break
+                except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
+
+            sleep(1)
+            scraper.close_modal()
+
+            sleep(1)
+            scraper.goto_acts()
             try:
-                future.result()
+                scraper.select_act(act)
+                scraper.handle_table(db)
             except Exception as e:
-                print(f"A thread encountered an error: {e}")
-
-    csv_writer.close()
-
-def scrape_orders():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    config['state'] = input('Select a state: ')
-    config['district'] = input('Select a district: ')
-    config['court_complex'] = input('Select a court complex: ')
-    config['court_establishment'] = input('Select a court establishment: ')
-    config['act'] = input('Select an act: ')
-
-    m.select_court()
-    m.goto_acts()
-    m.select_act()
-    m.handle_table()
-
-    m.driver.close()
+                print("EXCEPTION HANDLED:")
+                print(e)
 
-if __name__ == '__main__':
-    scrape_courts()
+scraper.driver.close()
```
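With this rewrite, `handle_table(db)` inserts each scraped case into the shared `db.json` TinyDB handle instead of a scraper-owned one. Reading the results back is a one-liner; a small sketch using the same path the script uses:

```python
from tinydb import TinyDB

db = TinyDB('db.json')
print(len(db.all()))      # number of case records inserted so far
for record in db.all():
    print(record)
```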
```diff
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
new file mode 100644
index 0000000..e75af84
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -0,0 +1,70 @@
+import csv
+from modules.scraper_case_status import ScraperCaseStatus
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+SCRAPE_ESTABLISHMENTS = True
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = ScraperCaseStatus(config)
+    scraper.close_modal()
+    try:
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
+                if SCRAPE_ESTABLISHMENTS:
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = ScraperCaseStatus(config)
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+if __name__ == '__main__':
+    scrape_courts()
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
new file mode 100644
index 0000000..485a4b8
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate_to_english.py
@@ -0,0 +1,42 @@
+from tempfile import TemporaryDirectory
+
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+from tinydb import TinyDB
+
+language = 'hin'
+
+def to_english(input_file, output_file):
+    image_file_list = []
+
+    with TemporaryDirectory() as tempdir:
+        pdf_pages = convert_from_path(input_file, 500)
+
+        for page_enumeration, page in enumerate(pdf_pages, start=1):
+            filename = f"{tempdir}/page_{page_enumeration}.jpg"
+            page.save(filename, "JPEG")
+            image_file_list.append(filename)
+
+        with open(output_file, "a") as h:
+            for image_file in image_file_list:
+                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
+
+                # In many PDFs, at line ending, if a word can't
+                # be written fully, a 'hyphen' is added.
+                # The rest of the word is written in the next line
+                # Eg: This is a sample text this word here GeeksF-
+                # orGeeks is half on first line, remaining on next.
+                # To remove this, we replace every '-\n' to ''.
+                text = text.replace("-\n", "")
+
+                breakpoint()
+
+                h.write(text)
+
+db = TinyDB('orders.json')
+entries = db.all()
+
+for entry in entries:
+    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
```
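`to_english` leans on two system tools the Python packages only wrap: `pdf2image.convert_from_path` shells out to Poppler, and `pytesseract` with `lang='hin'` needs Tesseract's Hindi traineddata (the flake provides `tesseract`, but `p.pdf2image` is still commented out). A quick sanity check before running it, under those environment assumptions:

```python
import pytesseract

# Hindi OCR only works if 'hin' appears here; this calls
# `tesseract --list-langs` under the hood. Poppler's pdftoppm must also
# be on PATH for pdf2image to convert pages.
print(pytesseract.get_languages(config=''))
```

Note also the stray `breakpoint()` in the OCR loop: it drops into the debugger on every page, so the script cannot run unattended until it is removed.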
```diff
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..818a333
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
new file mode 100644
index 0000000..c64f425
--- /dev/null
+++ b/test/transcribe.py
@@ -0,0 +1,14 @@
+import os
+import whisper
+
+def transcribe_audio(audio_file_path, model_path):
+    model = whisper.load_model(model_path)
+    result = model.transcribe(audio_file_path)
+    text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
+    with open(text_file_path, "w") as text_file:
+        text_file.write(result['text'])
+
+audio_file_path = 'test.mp3'
+
+if audio_file_path is not None:
+    transcribe_audio(audio_file_path, model_path='medium')
```
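In `transcribe_audio`, the `model_path` argument is really a model *name*: `whisper.load_model('medium')` resolves named checkpoints and downloads the weights on first use, and decoding `test.mp3` requires `ffmpeg` on `PATH` (neither is pinned by the flake yet; `p.openai-whisper` and `p.torch-bin` are still commented out). A minimal usage sketch under those assumptions:

```python
import whisper

# 'medium' is a registered model name; the first call downloads the
# weights. Pass a smaller name like 'base' for a quick smoke test.
model = whisper.load_model("medium")
result = model.transcribe("test.mp3")
print(result["text"])
```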
