commit    a02c8f4c8643b4b9a531e185813c5d82b6866ec0
tree      a3cdd49df8412e63ac711c148df6814efa0a05e7
parent    7195110a466b0ed14de1b8ee4fa8d7bb79626018
author    Raghuram Subramani <raghus2247@gmail.com>  2025-03-27 23:05:13 +0530
committer Raghuram Subramani <raghus2247@gmail.com>  2025-03-27 23:05:13 +0530

    update
Diffstat:
 -rw-r--r--  scrape_ecourtindia_v6/modules/scraper_case_status.py |   2
 -rw-r--r--  scrape_ecourtindia_v6/modules/scraper_orders.py      |  84
 -rw-r--r--  scrape_ecourtindia_v6/orders_scrape_courts.py        | 130
 -rw-r--r--  scrape_ecourtindia_v6/scrape_orders.py               | 113

 4 files changed, 256 insertions(+), 73 deletions(-)
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index d9b925d..684d9d7 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -134,7 +134,7 @@ class ScraperCaseStatus(Scraper):
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
 
-            sleep(2)
+            sleep(0.7)
 
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
index 78594e8..a9fe7be 100644
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -1,17 +1,25 @@
 from time import sleep
+import tempfile
+import uuid
+import os
+
+from urllib import request
+
+from bs4 import BeautifulSoup
+
+import cv2
+import pytesseract
 
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.select import Select
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperOrders(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index')
+    def __init__(self, db, config):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index', headless=True)
 
-        self.db = TinyDB('db.json')
+        self.db = db
         self.config = config
 
     def goto_courtnumber(self):
@@ -27,3 +35,69 @@ class ScraperOrders(Scraper):
         print(f'COURT NUMBERS: {court_numbers}')
 
         return court_numbers
+
+    def submit_search(self):
+        captcha_incomplete = True
+        while captcha_incomplete:
+            img = self.driver.find_element(By.ID, 'captcha_image')
+            temp = tempfile.NamedTemporaryFile(suffix='.png')
+            img.screenshot(temp.name)
+
+            img = cv2.imread(temp.name)
+            text = pytesseract.image_to_string(img).strip()
+
+            element = self.driver.find_element(By.ID, 'order_no_captcha_code')
+            element.send_keys(text)
+
+            self.driver.execute_script('submitCourtNumber()')
+            sleep(3)
+
+            if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed():
+                self.close_modal()
+                element.clear()
+            else:
+                captcha_incomplete = False
+
+    def parse_orders_table(self):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return
+
+        rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
+        self.rows = []
+        i = 6
+        while i < len(rows):
+            self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
+            i += 5
+
+    def handle_orders(self):
+        for row in self.rows:
+            order = row[0]
+
+            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
+            self.driver.execute_script(script)
+
+            sleep(0.7)
+            obj = self.driver.find_elements(By.TAG_NAME, 'object')[-1]
+            pdf_url = str(obj.get_attribute('data'))
+
+            while True:
+                filename = f"pdf/{uuid.uuid4().hex}.pdf"
+                if not os.path.exists(filename):
+                    break
+
+            cookies = "; ".join([f"{c['name']}={c['value']}" for c in self.driver.get_cookies()])
+            r = request.Request(pdf_url)
+            r.add_header("Cookie", cookies)
+
+            try:
+                with request.urlopen(r) as response, open(filename, "wb") as file:
+                    file.write(response.read())
+            except:
+                print(f'UNABLE TO FETCH PDF: {pdf_url}')
+
+            record = { 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
+            self.db.insert(record)
+
+            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
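Note: the new submit_search() solves the form captcha by screenshotting the captcha <img> element and running Tesseract over it, retrying whenever the alert-danger-cust banner appears. A minimal sketch of just that OCR step, assuming a live Selenium driver (the element id 'captcha_image' is taken from the diff; the helper name is illustrative):

import tempfile

import cv2
import pytesseract
from selenium.webdriver.common.by import By

def read_captcha(driver):
    # Screenshot only the captcha element, then hand the image to Tesseract.
    element = driver.find_element(By.ID, 'captcha_image')
    with tempfile.NamedTemporaryFile(suffix='.png') as temp:
        element.screenshot(temp.name)       # PNG of just the <img> region
        image = cv2.imread(temp.name)       # decode to a numpy array for OCR
    return pytesseract.image_to_string(image).strip()

handle_orders() then fetches each order PDF outside the browser, which only works because the PDF URL is tied to the logged-in session: the Selenium cookies are replayed on a plain urllib request. The same pattern in isolation (function name and arguments are illustrative):

from urllib import request

def fetch_with_session_cookies(driver, url, out_path):
    # Collapse the browser's cookie jar into a single Cookie header.
    cookies = "; ".join(f"{c['name']}={c['value']}" for c in driver.get_cookies())
    req = request.Request(url)
    req.add_header("Cookie", cookies)
    with request.urlopen(req) as response, open(out_path, "wb") as fh:
        fh.write(response.read())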
diff --git a/scrape_ecourtindia_v6/orders_scrape_courts.py b/scrape_ecourtindia_v6/orders_scrape_courts.py
new file mode 100644
index 0000000..597ce9f
--- /dev/null
+++ b/scrape_ecourtindia_v6/orders_scrape_courts.py
@@ -0,0 +1,130 @@
+import csv
+from time import sleep
+from modules.scraper_orders import ScraperOrders
+from selenium.webdriver.common.by import By
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+            print(f'Wrote: {row}')
+
+    def close(self):
+        self.file.close()
+
+def scrape_district(state, district, csv_writer):
+    try:
+        config = {}
+        scraper = ScraperOrders(config)
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', state)
+        scraper.select('sess_dist_code', district)
+
+        complexes = scraper.scrape_complexes()
+        scraper.select('court_complex_code', complexes[0])
+
+        sleep(2)
+        scraper.goto_courtnumber()
+
+        for cmplx in complexes:
+            while True:
+                sleep(0.5)
+                try:
+                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                    if modal_is_open:
+                        scraper.close_modal()
+                        continue
+                    break
+                except:
+                    break
+
+            scraper.select('court_complex_code', cmplx)
+            sleep(0.5)
+
+            court_numbers = scraper.get_court_numbers()
+            for court_number in court_numbers:
+                row = [state, district, cmplx, court_number]
+                csv_writer.writerow(row)
+
+        scraper.driver.quit()
+
+    except Exception as e:
+        print(f"Error scraping district {district}: {e}")
+
+def scrape_courts():
+    state = 'Uttar Pradesh'
+
+    config = {}
+    scraper = ScraperOrders(config)
+    scraper.close_modal()
+    scraper.select('sess_state_code', state)
+
+    districts = scraper.scrape_districts()
+    scraper.driver.quit()
+
+    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
+    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_district, state, district, csv_writer)
+            for district in districts
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+def scrape_orders(courts):
+    csvfile = open(courts, newline='')
+    reader = csv.reader(csvfile)
+
+    for row in reader:
+        print(row)
+        config = {}
+        scraper = ScraperOrders(config)
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+
+        while True:
+            sleep(0.5)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except:
+                break
+
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+
+        scraper.select('nnjudgecode1', row[3])
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+        scraper.submit_search()
+
+        scraper.parse_orders_table()
+        scraper.handle_orders()
+
+        break
+
+    csvfile.close()
+
+if __name__ == '__main__':
+    scrape_orders('csv/2023-24_pocso.csv')
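Note: the new file above still constructs ScraperOrders(config) with the old one-argument signature, while this same commit changes the constructor to ScraperOrders(db, config); as committed, those call sites would raise a TypeError when reached. The scrape_courts() fan-out itself follows a common pattern: one csv.writer shared by several threads, serialized with a Lock. A minimal self-contained sketch of that pattern, with an illustrative worker and input data:

import csv
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

lock = threading.Lock()

def worker(district, writer):
    row = [district, 'court-1']          # stand-in for scraped court numbers
    with lock:                           # csv.writer is not safe to share unlocked
        writer.writerow(row)

with open('court_numbers.csv', 'w', newline='') as fh:
    writer = csv.writer(fh)
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = [pool.submit(worker, d, writer) for d in ['Agra', 'Meerut']]
        for future in as_completed(futures):
            future.result()              # re-raise any worker exception here

Holding the lock only around writerow keeps the scraping itself fully parallel; only the file append is serialized.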
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
index 74cdd56..54a2d80 100644
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -1,82 +1,71 @@
 import csv
 from time import sleep
+
+from tinydb import TinyDB
 from modules.scraper_orders import ScraperOrders
 from selenium.webdriver.common.by import By
 from concurrent.futures import ThreadPoolExecutor, as_completed
+
 import threading
 
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
+class ThreadSafeDB:
+    def __init__(self):
+        self.db = TinyDB('orders.json')
         self.lock = threading.Lock()
-
-    def writerow(self, row):
+
+    def insert(self, record):
         with self.lock:
-            self.writer.writerow(row)
-            print(f'Wrote: {row}')
+            self.db.insert(record)
+            print(f'INSERTED: {record}')
 
-    def close(self):
-        self.file.close()
+db = ThreadSafeDB()
 
-def scrape_district(state, district, csv_writer):
+def scrape_single_court(row):
     try:
         config = {}
-        scraper = ScraperOrders(config)
+        scraper = ScraperOrders(db, config)
         scraper.close_modal()
 
-        scraper.select('sess_state_code', state)
-        scraper.select('sess_dist_code', district)
-
-        complexes = scraper.scrape_complexes()
-        scraper.select('court_complex_code', complexes[0])
-
-        sleep(2)
-        scraper.goto_courtnumber()
-
-        for cmplx in complexes:
-            while True:
-                sleep(0.5)
-                try:
-                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
-                    if modal_is_open:
-                        scraper.close_modal()
-                        continue
-                    break
-                except:
-                    break
-
-            scraper.select('court_complex_code', cmplx)
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+
+        while True:
             sleep(0.5)
-
-            court_numbers = scraper.get_court_numbers()
-            for court_number in court_numbers:
-                row = [state, district, cmplx, court_number]
-                csv_writer.writerow(row)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except:
+                break
+
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+        scraper.select('nnjudgecode1', row[3])
+
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+
+        scraper.submit_search()
+        scraper.parse_orders_table()
+        scraper.handle_orders()
 
         scraper.driver.quit()
 
     except Exception as e:
-        print(f"Error scraping district {district}: {e}")
+        print(f"Error processing court {row}: {e}")
 
-def scrape_courts():
-    state = 'Uttar Pradesh'
-
-    config = {}
-    scraper = ScraperOrders(config)
-    scraper.close_modal()
-    scraper.select('sess_state_code', state)
-
-    districts = scraper.scrape_districts()
-    scraper.driver.quit()
-
-    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
-    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+def scrape_orders(courts_csv):
+    with open(courts_csv, newline='') as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)
+        courts = list(reader)
 
     with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
-            executor.submit(scrape_district, state, district, csv_writer)
-            for district in districts
+            executor.submit(scrape_single_court, court)
+            for court in courts
         ]
 
         for future in as_completed(futures):
@@ -85,16 +74,6 @@ def scrape_courts():
             except Exception as e:
                 print(f"A thread encountered an error: {e}")
 
-    csv_writer.close()
-
-def scrape_orders(courts):
-    csvfile = open(courts, newline='')
-    reader = csv.reader(csvfile)
-
-    for row in reader:
-        print(row)
-
-    csvfile.close()
-
 if __name__ == '__main__':
-    scrape_orders('csv/2023-24_pocso.csv')
+    input_file = 'csv/2023-24_pocso.csv'
+    scrape_orders(input_file)
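The rewritten scrape_orders.py funnels every insert through the lock-guarded ThreadSafeDB wrapper into orders.json, so all five workers share one TinyDB handle. For anyone consuming that output, a sketch of reading it back with TinyDB's query API, assuming the record layout inserted by handle_orders (the date value shown is an illustrative guess at the portal's format):

from tinydb import TinyDB, Query

db = TinyDB('orders.json')
Order = Query()

# Each record carries case_info, petitioner_respondent, date, and the
# local PDF filename, per the insert in handle_orders().
for record in db.search(Order.date == '01-01-2024'):
    print(record['case_info'], record['filename'])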