diff options
author | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-27 23:05:13 +0530 |
---|---|---|
committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-27 23:05:13 +0530 |
commit | a02c8f4c8643b4b9a531e185813c5d82b6866ec0 (patch) | |
tree | a3cdd49df8412e63ac711c148df6814efa0a05e7 /scrape_ecourtindia_v6/modules/scraper_orders.py | |
parent | 7195110a466b0ed14de1b8ee4fa8d7bb79626018 (diff) |
update
Diffstat (limited to 'scrape_ecourtindia_v6/modules/scraper_orders.py')
-rw-r--r-- | scrape_ecourtindia_v6/modules/scraper_orders.py | 84 |
1 files changed, 79 insertions, 5 deletions
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py index 78594e8..a9fe7be 100644 --- a/scrape_ecourtindia_v6/modules/scraper_orders.py +++ b/scrape_ecourtindia_v6/modules/scraper_orders.py @@ -1,17 +1,25 @@ from time import sleep +import tempfile +import uuid +import os + +from urllib import request + +from bs4 import BeautifulSoup + +import cv2 +import pytesseract from selenium.webdriver.common.by import By from selenium.webdriver.support.select import Select -from tinydb import TinyDB - from .scraper import Scraper class ScraperOrders(Scraper): - def __init__(self, config): - Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index') + def __init__(self, db, config): + Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index', headless=True) - self.db = TinyDB('db.json') + self.db = db self.config = config def goto_courtnumber(self): @@ -27,3 +35,69 @@ class ScraperOrders(Scraper): print(f'COURT NUMBERS: {court_numbers}') return court_numbers + + def submit_search(self): + captcha_incomplete = True + while captcha_incomplete: + img = self.driver.find_element(By.ID, 'captcha_image') + temp = tempfile.NamedTemporaryFile(suffix='.png') + img.screenshot(temp.name) + + img = cv2.imread(temp.name) + text = pytesseract.image_to_string(img).strip() + + element = self.driver.find_element(By.ID, 'order_no_captcha_code') + element.send_keys(text) + + self.driver.execute_script('submitCourtNumber()') + sleep(3) + + if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed(): + self.close_modal() + element.clear() + else: + captcha_incomplete = False + + def parse_orders_table(self): + try: + table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML') + except: + return + + rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td') + self.rows = [] + i = 6 + while i < len(rows): + self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ]) + i += 5 + + def handle_orders(self): + for row in self.rows: + order = row[0] + + script = order.find_all('a')[0].get_attribute_list('onclick')[0] + self.driver.execute_script(script) + + sleep(0.7) + obj = self.driver.find_elements(By.TAG_NAME, 'object')[-1] + pdf_url = str(obj.get_attribute('data')) + + while True: + filename = f"pdf/{uuid.uuid4().hex}.pdf" + if not os.path.exists(filename): + break + + cookies = "; ".join([f"{c['name']}={c['value']}" for c in self.driver.get_cookies()]) + r = request.Request(pdf_url) + r.add_header("Cookie", cookies) + + try: + with request.urlopen(r) as response, open(filename, "wb") as file: + file.write(response.read()) + except: + print(f'UNABLE TO FETCH PDF: {pdf_url}') + + record = { 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename } + self.db.insert(record) + + self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click() |