"""Selenium-driven scraper for the eCourts case-status "Act" search.

The scraper opens the case-status page in Firefox, selects a fixed
state / district / court complex / establishment / act, solves the captcha
with Tesseract OCR, and then walks the result table.  For every result it
saves the case-detail HTML under ``html/``, downloads the order PDFs under
``pdf/``, and hands a summary record to the ``db`` object it was given.
"""

from http.client import HTTPException
import os
import tempfile
from time import sleep
from urllib import request
import uuid

from bs4 import BeautifulSoup
import cv2
import pytesseract
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

# Option values fed to the page's <select> elements (see ``goto_acts`` and
# ``select_act``).  The names are kept as-is because other modules may
# import them.
Karnataka = '3'  # value for the 'sess_state_code' select
Bengaluru = '20'  # value for the 'sess_dist_code' select
CMM_Court_Complex = '1030134@2,5,10,11,12,13,14@Y'  # 'court_complex_code'
Chief_Metropolitan = '10'  # value for the 'court_est_code' select
ACT = '23'  # value for the 'actcode' select

_START_URL = 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index'

# JavaScript the page exposes for dismissing its validation/error modal.
_CLOSE_MODAL_SCRIPT = 'closeModel({modal_id:"validateError"})'


def _unique_path(directory, extension):
    """Return a not-yet-existing ``<directory>/<random hex>.<extension>`` path.

    The directory is created if it is missing so that the caller can open
    the returned path for writing straight away.
    """
    os.makedirs(directory, exist_ok=True)
    while True:
        candidate = f"{directory}/{uuid.uuid4().hex}.{extension}"
        if not os.path.exists(candidate):
            return candidate


class Scraper:
    """Drive a Firefox session through the eCourts act search and store results.

    Args:
        db: Any object with an ``insert(record)`` method.  ``record`` is a
            dict with the keys ``case_info``, ``petitioner_respondent``,
            ``htmlfile`` and ``pdfs``.
    """

    def __init__(self, db):
        self.db = db
        self.driver = Firefox()
        self.driver.get(_START_URL)
        self.current_view = {}

    def run(self):
        """Run the complete scrape: dismiss modal, search, process the table."""
        self.close_modal()
        self.goto_acts()
        self.select_act()
        self.handle_table()

    def close(self):
        """Quit the browser and release the WebDriver session."""
        self.driver.quit()

    def close_modal(self):
        """Dismiss the modal the site shows right after the page loads."""
        sleep(3)
        self.driver.execute_script(_CLOSE_MODAL_SCRIPT)
        sleep(1)

    def select(self, i_d, value):
        """Choose ``value`` in the ``<select>`` whose element id is ``i_d``.

        The one-second pauses before and after give the page time to load
        the dependent drop-downs.
        """
        sleep(1)
        dropdown = Select(self.driver.find_element(By.ID, i_d))
        dropdown.select_by_value(value)
        sleep(1)

    def select_act(self):
        """Pick the act, restrict to disposed cases, and submit the search."""
        self.select('actcode', ACT)
        sleep(1)
        # Disposed only
        self.driver.find_element(By.ID, 'radDAct').click()
        self.submit_search()

    def goto_acts(self):
        """Select state, district, complex and establishment; open the Act tab.

        The first three selections are repeated for as long as the page
        answers with its error alert.
        """
        while True:
            self.select('sess_state_code', Karnataka)
            self.select('sess_dist_code', Bengaluru)
            self.select('court_complex_code', CMM_Court_Complex)
            sleep(2)
            alert = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust')
            if not alert.is_displayed():
                break
            self.driver.execute_script(_CLOSE_MODAL_SCRIPT)

        self.select('court_est_code', Chief_Metropolitan)
        sleep(1)
        self.driver.find_element(By.ID, 'act-tabMenu').click()
        sleep(1)

    def submit_search(self):
        """OCR the captcha and submit the act search until it is accepted.

        Each attempt screenshots the captcha image, reads it with
        Tesseract, types the text and calls the page's ``submitAct()``.
        A visible error alert is treated as a rejected captcha and the
        attempt is repeated.
        """
        while True:
            sleep(2)
            captcha = self.driver.find_element(By.ID, 'captcha_image')
            # The context manager closes (and deletes) the temporary file as
            # soon as OpenCV has loaded it, so retries do not leak handles.
            with tempfile.NamedTemporaryFile(suffix='.png') as temp:
                captcha.screenshot(temp.name)
                image = cv2.imread(temp.name)
            text = pytesseract.image_to_string(image).strip()

            field = self.driver.find_element(By.ID, 'act_captcha_code')
            field.send_keys(text)
            self.driver.execute_script('submitAct()')
            sleep(3)

            alert = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust')
            if not alert.is_displayed():
                return
            self.driver.execute_script(_CLOSE_MODAL_SCRIPT)
            field.clear()

    def handle_table(self):
        """Visit every result in the search table and persist it.

        The table is flattened into a list of ``<td>`` cells.  Starting at
        index 5 and stepping by 4, each visited cell holds the "view" link;
        the two cells before it hold the case info and the parties.
        """
        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
        self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        self.views = []

        for i in range(5, len(self.rows), 4):
            view = self.rows[i]
            parties = self.rows[i - 1].get_text(strip=True)
            self.current_view = {
                'case_info': self.rows[i - 2].get_text(strip=True),
                # Re-insert the spaces around 'Vs' that strip=True removed.
                'petitioner_respondent': ' Vs '.join(parties.split('Vs')),
                'htmlfile': '',
                'pdfs': [],
            }

            # The link's onclick handler opens the case-detail view.
            script = view.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)
            sleep(1)

            html = str(self.driver.find_element(By.ID, 'CSact').get_attribute('innerHTML'))
            filename = _unique_path('html', 'html')
            self.current_view['htmlfile'] = filename
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)

            self.parse_orders_table()
            self.db.insert(self.current_view)
            print(f'INSERTED: {self.current_view}')
            self.driver.find_element(By.ID, 'main_back_act').click()

    def parse_orders_table(self):
        """Collect the order-link cells of the open case and download them.

        Returns without doing anything when the case page has no element
        with class ``order_table``.
        """
        try:
            table = self.driver.find_element(By.CLASS_NAME, 'order_table')
        except NoSuchElementException:
            return
        table_innerhtml = table.get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        # Every third cell, starting at index 5, carries an order link.
        self.orders = cells[5::3]
        self.handle_orders()

    def handle_orders(self):
        """Download the PDF behind every cell in ``self.orders``.

        Each order is opened through its onclick handler, the PDF URL is
        read from the ``<object>`` element, and the file is fetched with
        the browser's cookies.  A path is added to
        ``self.current_view['pdfs']`` only once the file has been written;
        a failed download is reported and skipped.
        """
        for order in self.orders:
            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
            self.driver.execute_script(script)
            sleep(2)

            obj = self.driver.find_element(By.TAG_NAME, 'object')
            pdf_url = str(obj.get_attribute('data'))
            filename = _unique_path('pdf', 'pdf')

            # Reuse the browser session's cookies for the direct download.
            cookies = "; ".join(
                f"{c['name']}={c['value']}" for c in self.driver.get_cookies()
            )
            try:
                req = request.Request(pdf_url)
                req.add_header("Cookie", cookies)
                # Read the whole body before creating the file so a failed
                # download does not leave an empty PDF behind.
                with request.urlopen(req) as response:
                    payload = response.read()
                with open(filename, "wb") as file:
                    file.write(payload)
            except (OSError, ValueError, HTTPException):
                # OSError covers URLError/HTTPError and file errors;
                # ValueError covers a malformed or missing URL.
                print(f'UNABLE TO FETCH PDF: {pdf_url}')
            else:
                self.current_view['pdfs'].append(filename)

            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()