author    | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-27 22:03:30 +0530
committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-27 22:03:30 +0530
commit    | 7195110a466b0ed14de1b8ee4fa8d7bb79626018 (patch)
tree      | fef0ce165a5fad64bb0b4e7f49ef68a166ce399b
parent    | f1f43d3448bc879eed55f1e6865c06e646b7eb4a (diff)
refactor
-rw-r--r-- | scrape_ecourtindia_v6/.gitignore                                                                 |   1
-rw-r--r-- | scrape_ecourtindia_v6/csv/.keep                                                                  |   0
-rw-r--r-- | scrape_ecourtindia_v6/modules/scraper.py                                                         |  61
-rw-r--r-- | scrape_ecourtindia_v6/modules/scraper_case_status.py (renamed from scrape_ecourtindia_v6/scraper.py) |  79
-rw-r--r-- | scrape_ecourtindia_v6/modules/scraper_orders.py                                                  |  29
-rw-r--r-- | scrape_ecourtindia_v6/scrape_case_status.py (renamed from scrape_ecourtindia_v6/main.py)         |  29
-rw-r--r-- | scrape_ecourtindia_v6/scrape_orders.py                                                           | 100
7 files changed, 214 insertions, 85 deletions
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index ef1949c..62236f3 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1 +1,2 @@
 courts.csv
+csv/*

diff --git a/scrape_ecourtindia_v6/csv/.keep b/scrape_ecourtindia_v6/csv/.keep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scrape_ecourtindia_v6/csv/.keep

diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
new file mode 100644
index 0000000..4616763
--- /dev/null
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -0,0 +1,61 @@
+from time import sleep
+
+from selenium.webdriver import Firefox
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.support.select import Select
+
+class Scraper:
+    def __init__(self, base_url, headless=True):
+        options = Options()
+        if headless:
+            options.add_argument("--headless")
+
+        self.driver = Firefox(options=options)
+        self.driver.get(base_url)
+
+    def close_modal(self):
+        sleep(3)
+        self.driver.execute_script('closeModel({modal_id:"validateError"})')
+        sleep(1)
+
+    def select(self, i_d, value):
+        sleep(1)
+        element = self.driver.find_element(By.ID, i_d)
+        select = Select(element)
+        select.select_by_visible_text(value)
+        sleep(1)
+
+    def scrape_states(self):
+        element = self.driver.find_element(By.ID, 'sess_state_code')
+        options = Select(element).options
+        states = [ option.text for option in options[1:] ]
+        print(f'STATES: {states}')
+
+        sleep(0.2)
+
+        return states
+
+    def scrape_districts(self):
+        element = self.driver.find_element(By.ID, 'sess_dist_code')
+        options = Select(element).options
+        districts = [ option.text for option in options[1:] ]
+        print(f'DISTRICTS: {districts}')
+
+        return districts
+
+    def scrape_complexes(self):
+        element = self.driver.find_element(By.ID, 'court_complex_code')
+        options = Select(element).options
+        complexes = [ option.text for option in options[1:] ]
+        print(f'COMPLEXES: {complexes}')
+
+        return complexes
+
+    def scrape_establishments(self):
+        element = self.driver.find_element(By.ID, 'court_est_code')
+        options = Select(element).options
+        establishments = [ option.text for option in options[1:] if option.text != '' ]
+        print(f'ESTABLISHMENTS: {establishments}')
+
+        return establishments
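(Editorial sketch, not part of the commit.) The new modules/scraper.py pulls the Firefox setup and the four dropdown scrapers into a reusable base class; a subclass only hands its entry URL to Scraper.__init__. A minimal usage sketch, assuming the case-status page still renders the sess_state_code dropdown; the subclass name here is made up:

    from modules.scraper import Scraper

    class StatesOnly(Scraper):  # hypothetical subclass, for illustration only
        def __init__(self):
            Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')

    s = StatesOnly()
    s.close_modal()           # dismiss the site's validateError modal first
    print(s.scrape_states())  # option texts, minus the placeholder entry
    s.driver.quit()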
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 18b519a..d9b925d 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -4,9 +4,7 @@
 import uuid
 from urllib import request
 
-from selenium.webdriver import Firefox
 from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
@@ -15,30 +13,16 @@
 import cv2
 import pytesseract
 import tempfile
 
-class Scraper:
-    def __init__(self, db, config):
-        self.db = db
-        self.config = config
-
-        options = Options()
-        options.add_argument("--headless")
+from tinydb import TinyDB
 
-        self.driver = Firefox(options=options)
-        self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
+from .scraper import Scraper
 
-        self.current_view = {}
+class ScraperCaseStatus(Scraper):
+    def __init__(self, config):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
 
-    def close_modal(self):
-        sleep(3)
-        self.driver.execute_script('closeModel({modal_id:"validateError"})')
-        sleep(1)
-
-    def select(self, i_d, value):
-        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
-        select = Select(element)
-        select.select_by_visible_text(value)
-        sleep(1)
+        self.db = TinyDB('db.json')
+        self.config = config
 
     def select_act(self):
         self.select('actcode', self.config['act'])
@@ -48,55 +32,6 @@ class Scraper:
         self.driver.find_element(By.ID, 'radDAct').click()
         self.submit_search()
 
-    def scrape_states(self):
-        element = self.driver.find_element(By.ID, 'sess_state_code')
-        options = Select(element).options
-        states = [ option.text for option in options[1:] ]
-        print(f'STATES: {states}')
-
-        sleep(0.2)
-
-        return states
-
-    def scrape_districts(self, state):
-        self.select('sess_state_code', state)
-        sleep(0.2)
-
-        element = self.driver.find_element(By.ID, 'sess_dist_code')
-        options = Select(element).options
-        districts = [ option.text for option in options[1:] ]
-        print(f'DISTRICTS: {districts}')
-
-        return districts
-
-    def scrape_complexes(self, state, district):
-        self.select('sess_state_code', state)
-        sleep(0.2)
-        self.select('sess_dist_code', district)
-        sleep(0.2)
-
-        element = self.driver.find_element(By.ID, 'court_complex_code')
-        options = Select(element).options
-        complexes = [ option.text for option in options[1:] ]
-        print(f'COMPLEXES: {complexes}')
-
-        return complexes
-
-    def scrape_establishments(self, state, district, cmplx):
-        self.select('sess_state_code', state)
-        sleep(0.2)
-        self.select('sess_dist_code', district)
-        sleep(0.2)
-        self.select('court_complex_code', cmplx)
-        sleep(1)
-
-        element = self.driver.find_element(By.ID, 'court_est_code')
-        options = Select(element).options
-        establishments = [ option.text for option in options[1:] ]
-        print(f'ESTABLISHMENTS: {establishments}')
-
-        return establishments
-
     def select_court(self):
         sleep(2)
         while True:

diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
new file mode 100644
index 0000000..78594e8
--- /dev/null
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -0,0 +1,29 @@
+from time import sleep
+
+from selenium.webdriver.common.by import By
+
+from selenium.webdriver.support.select import Select
+from tinydb import TinyDB
+
+from .scraper import Scraper
+
+class ScraperOrders(Scraper):
+    def __init__(self, config):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index')
+
+        self.db = TinyDB('db.json')
+        self.config = config
+
+    def goto_courtnumber(self):
+        element = self.driver.find_element(By.ID, 'courtnumber-tabMenu')
+        element.click()
+        sleep(1)
+
+    def get_court_numbers(self):
+        element = self.driver.find_element(By.ID, 'nnjudgecode1')
+        select = Select(element)
+        options = select.options
+        court_numbers = [ option.text for option in options ]
+        print(f'COURT NUMBERS: {court_numbers}')
+
+        return court_numbers
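(Editorial sketch, not part of the commit.) Note that scrape_districts/scrape_complexes/scrape_establishments no longer take (state, district, ...) arguments: the caller now drives the cascade itself, calling select() before scraping each level. A short sketch against ScraperOrders; the state value is a placeholder:

    from modules.scraper_orders import ScraperOrders

    scraper = ScraperOrders({})                     # config dict is unused by these calls
    scraper.close_modal()
    scraper.select('sess_state_code', 'Kerala')     # placeholder state
    districts = scraper.scrape_districts()
    scraper.select('sess_dist_code', districts[0])
    scraper.goto_courtnumber()                      # switch to the Court Number tab
    print(scraper.get_court_numbers())
    scraper.driver.quit()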
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 9d4c193..2b543ba 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,11 +1,8 @@
 import csv
-from scraper import Scraper
-from tinydb import TinyDB
+from modules.scraper_case_status import ScraperCaseStatus
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import threading
-db = TinyDB('db.json')
-
 SCRAPE_ESTABLISHMENTS = True
 
 class ThreadSafeCSVWriter:
     def __init__(self, filename):
@@ -22,14 +19,20 @@
         self.file.close()
 
 def scrape_state_thread(state, config, csv_writer):
-    scraper = Scraper(db, config)
+    scraper = ScraperCaseStatus(config)
     scraper.close_modal()
 
     try:
-        for district in scraper.scrape_districts(state):
-            for cmplx in scraper.scrape_complexes(state, district):
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
                 if SCRAPE_ESTABLISHMENTS:
-                    for establishment in scraper.scrape_establishments(state, district, cmplx):
-                        csv_writer.writerow([ state, district, cmplx, establishment ])
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
                 else:
                     csv_writer.writerow([ state, district, cmplx ])
     except Exception as e:
@@ -40,16 +43,16 @@ def scrape_state_thread(state, config, csv_writer):
 
 def scrape_courts():
     config = {}
-    m = Scraper(db, config)
+    m = ScraperCaseStatus(config)
     m.close_modal()
 
-    csv_writer = ThreadSafeCSVWriter('courts.csv')
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
     csv_writer.writerow(['State', 'District', 'Complex'])
 
     states = m.scrape_states()
     m.driver.close()
 
-    with ThreadPoolExecutor(max_workers=10) as executor:
+    with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
             executor.submit(scrape_state_thread, state, config, csv_writer)
             for state in states
@@ -66,7 +69,7 @@ def scrape_courts():
 
 def scrape_orders():
     config = {}
-    m = Scraper(db, config)
+    m = ScraperCaseStatus(config)
     m.close_modal()
 
     config['state'] = input('Select a state: ')
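(Editorial sketch, not part of the commit.) scrape_case_status.py now gives each worker thread its own ScraperCaseStatus, and therefore its own Firefox instance; the only shared object is the lock-guarded CSV writer. A sketch of the writer in isolation, assuming the module can be imported without side effects and using a made-up output path:

    from concurrent.futures import ThreadPoolExecutor

    from scrape_case_status import ThreadSafeCSVWriter  # the class defined above

    writer = ThreadSafeCSVWriter('csv/demo.csv')        # hypothetical file
    with ThreadPoolExecutor(max_workers=5) as pool:
        for i in range(20):
            pool.submit(writer.writerow, ['row', i])    # the lock keeps each row intact
    writer.close()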
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
new file mode 100644
index 0000000..74cdd56
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -0,0 +1,100 @@
+import csv
+from time import sleep
+from modules.scraper_orders import ScraperOrders
+from selenium.webdriver.common.by import By
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+            print(f'Wrote: {row}')
+
+    def close(self):
+        self.file.close()
+
+def scrape_district(state, district, csv_writer):
+    try:
+        config = {}
+        scraper = ScraperOrders(config)
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', state)
+        scraper.select('sess_dist_code', district)
+
+        complexes = scraper.scrape_complexes()
+        scraper.select('court_complex_code', complexes[0])
+
+        sleep(2)
+        scraper.goto_courtnumber()
+
+        for cmplx in complexes:
+            while True:
+                sleep(0.5)
+                try:
+                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                    if modal_is_open:
+                        scraper.close_modal()
+                        continue
+                    break
+                except:
+                    break
+
+            scraper.select('court_complex_code', cmplx)
+            sleep(0.5)
+
+            court_numbers = scraper.get_court_numbers()
+            for court_number in court_numbers:
+                row = [state, district, cmplx, court_number]
+                csv_writer.writerow(row)
+
+        scraper.driver.quit()
+
+    except Exception as e:
+        print(f"Error scraping district {district}: {e}")
+
+def scrape_courts():
+    state = 'Uttar Pradesh'
+
+    config = {}
+    scraper = ScraperOrders(config)
+    scraper.close_modal()
+    scraper.select('sess_state_code', state)
+
+    districts = scraper.scrape_districts()
+    scraper.driver.quit()
+
+    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
+    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_district, state, district, csv_writer)
+            for district in districts
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+def scrape_orders(courts):
+    csvfile = open(courts, newline='')
+    reader = csv.reader(csvfile)
+
+    for row in reader:
+        print(row)
+
+    csvfile.close()
+
+if __name__ == '__main__':
+    scrape_orders('csv/2023-24_pocso.csv')
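(Editorial sketch, not part of the commit.) As committed, scrape_orders() is still a stub that only echoes rows from the input CSV. A sketch of consuming the csv/court_numbers.csv that scrape_courts() writes, assuming the four-column header shown above:

    import csv

    with open('csv/court_numbers.csv', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the ['State', 'District', 'Cmplx', 'Court number'] header
        for state, district, cmplx, court_number in reader:
            print(f'{state} / {district} / {cmplx}: court {court_number}')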