aboutsummaryrefslogtreecommitdiff
path: root/scrape_ecourtindia_v6/scraper.py
diff options
context:
space:
mode:
author Raghuram Subramani <raghus2247@gmail.com> 2025-03-24 17:11:41 +0530
committer Raghuram Subramani <raghus2247@gmail.com> 2025-03-24 17:11:41 +0530
commit 434252fa1831465b36e32206684e78cd698e8462 (patch)
tree 712d1601d7cfb6991a343e1015a07b8931cdf896 /scrape_ecourtindia_v6/scraper.py
parent 33a320d48dddb44ec7d838ae9fdeaa44fabba342 (diff)
upload scrape_ecourtindia_v6
Diffstat (limited to 'scrape_ecourtindia_v6/scraper.py')
-rw-r--r-- scrape_ecourtindia_v6/scraper.py 166
1 files changed, 166 insertions, 0 deletions
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
new file mode 100644
index 0000000..ebe559c
--- /dev/null
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -0,0 +1,166 @@
import os
import shutil
import tempfile
import uuid
from time import sleep
from urllib import request

import cv2
import pytesseract
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

# Option values for the search form. Each constant is passed to
# Scraper.select() together with the id of the dropdown named in its comment.
Karnataka = '3'  # 'sess_state_code' dropdown
Bengaluru = '20'  # 'sess_dist_code' dropdown
CMM_Court_Complex = '1030134@2,5,10,11,12,13,14@Y'  # 'court_complex_code' dropdown
Chief_Metropolitan = '10'  # 'court_est_code' dropdown

ACT = '23'  # 'actcode' dropdown, selected after the 'act-tabMenu' tab is opened

class Scraper:
    """Drive the case-status search in Firefox and archive what it returns.

    The search flow is ``run()``: dismiss the start-up modal, pick the court
    in the dropdowns, search by act, then index the result table.
    ``handle_views()`` afterwards opens each indexed case, saves its HTML
    under ``html/``, downloads any order PDFs under ``pdf/`` and hands one
    record per case to ``db.insert``.

    Attributes:
        db: Storage object; only ``db.insert(record)`` is called on it.
        driver: The Selenium Firefox driver.
        current_view: Record (dict) of the case currently being processed.
        views: Result-table cells holding each case's "view" link.
        view_records: One record per entry of ``views``, in the same order.
        orders: Order-table cells holding each order's link.
    """

    def __init__(self, db):
        """Start Firefox and load the case-status page.

        Args:
            db: Object exposing ``insert(record)``.
        """
        self.db = db

        self.driver = Firefox()
        self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')

        self.current_view = {}
        # Defined up front so handle_views()/handle_orders() are harmless
        # no-ops when called before the corresponding parse_* method.
        self.views = []
        self.view_records = []
        self.orders = []

    def run(self):
        """Perform the search and index the result table.

        Does not open the individual cases; call ``handle_views()`` for that.
        """
        self.close_modal()
        self.goto_acts()
        self.select_act()
        self.parse_table()

    def close_modal(self):
        """Dismiss the 'validateError' modal via the page's own script."""
        sleep(2)
        self.driver.execute_script('closeModel({modal_id:"validateError"})')
        sleep(1)

    def select(self, i_d, value):
        """Choose the option with the given ``value`` in the ``<select>`` whose id is ``i_d``."""
        element = self.driver.find_element(By.ID, i_d)
        Select(element).select_by_value(value)
        # Fixed pause after each selection, as in the original flow.
        sleep(1)

    def select_act(self):
        """Pick the act, restrict the search to disposed cases and submit."""
        self.select('actcode', ACT)
        sleep(1)

        # Disposed only
        self.driver.find_element(By.ID, 'radDAct').click()
        self.submit_search()

    def goto_acts(self):
        """Select state, district, court complex and establishment, then open the act tab."""
        self.select('sess_state_code', Karnataka)
        self.select('sess_dist_code', Bengaluru)
        self.select('court_complex_code', CMM_Court_Complex)

        sleep(1)
        self.select('court_est_code', Chief_Metropolitan)
        sleep(1)
        self.driver.find_element(By.ID, 'act-tabMenu').click()
        sleep(1)

    def submit_search(self):
        """OCR the captcha image, type the result and submit the act search."""
        sleep(2)
        captcha = self.driver.find_element(By.ID, 'captcha_image')

        # The context manager closes (and thereby removes) the temporary
        # screenshot once it has been read back; previously it was left open.
        with tempfile.NamedTemporaryFile(suffix='.png') as temp:
            captcha.screenshot(temp.name)
            image = cv2.imread(temp.name)
        text = pytesseract.image_to_string(image).strip()

        self.driver.find_element(By.ID, 'act_captcha_code').send_keys(text)

        self.driver.execute_script('submitAct()')
        sleep(3)

    def parse_table(self):
        """Index the 'dispTable' result table into ``views`` and ``view_records``.

        Starting at the sixth ``<td>`` and stepping by four, each selected
        cell is stored as a view cell; the two cells before it supply the
        case info and the parties. ``current_view`` ends up as the last
        record built (unchanged when the table yields none).
        """
        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')

        self.views = []
        self.view_records = []
        for index in range(5, len(cells), 4):
            self.views.append(cells[index])
            # One record per row: a single shared dict would leave every
            # case carrying the metadata of the last row.
            self.view_records.append(self._new_record(
                cells[index - 2].get_text(strip=True),
                cells[index - 1].get_text(strip=True),
            ))

        if self.view_records:
            self.current_view = self.view_records[-1]

    def handle_views(self, limit=10):
        """Open indexed cases, archive each one and insert its record.

        Args:
            limit: Maximum number of indexed cases to process; ``None``
                processes all of them. Defaults to 10.
        """
        pending = list(zip(self.views, self.view_records))
        if limit is not None:
            pending = pending[:max(limit, 0)]

        for view, record in pending:
            script = self._onclick_script(view)
            if script is None:
                # Cell has no usable link; nothing to open for this row.
                continue

            # handle_orders() appends to current_view, so point it at this
            # case's own record before anything is collected.
            self.current_view = record
            self.driver.execute_script(script)
            sleep(1)

            html = str(self.driver.find_element(By.ID, 'CSact').get_attribute('innerHTML'))

            os.makedirs('html', exist_ok=True)
            filename = self._unique_path('html', '.html')
            record['htmlfile'] = filename
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)

            self.parse_orders_table()

            self.db.insert(record)
            self.driver.find_element(By.ID, 'main_back_act').click()

    def parse_orders_table(self):
        """Index the order table of the open case and download its orders.

        Returns without doing anything when the page has no element with
        class 'order_table'. Any other driver error propagates.
        """
        self.orders = []

        # find_elements() returns an empty list instead of raising, so no
        # blanket ``except`` is needed to detect a missing table.
        tables = self.driver.find_elements(By.CLASS_NAME, 'order_table')
        if not tables:
            return

        table_innerhtml = tables[0].get_attribute('innerHTML')
        cells = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
        # Every third cell from the sixth onwards is treated as a link cell.
        self.orders = [cells[index] for index in range(5, len(cells), 3)]

        self.handle_orders()

    def handle_orders(self):
        """Download the PDF behind every indexed order into ``pdf/``.

        Each saved path is appended to ``current_view['pdfs']`` after the
        download has completed.
        """
        for order in self.orders:
            script = self._onclick_script(order)
            if script is None:
                continue
            self.driver.execute_script(script)

            sleep(2)
            obj = self.driver.find_element(By.TAG_NAME, 'object')
            pdf_url = str(obj.get_attribute('data'))

            os.makedirs('pdf', exist_ok=True)
            filename = self._unique_path('pdf', '.pdf')

            # Forward the browser's cookies with the direct download request.
            cookies = "; ".join(f"{c['name']}={c['value']}" for c in self.driver.get_cookies())
            r = request.Request(pdf_url)
            r.add_header("Cookie", cookies)

            with request.urlopen(r) as response, open(filename, "wb") as file:
                # Stream to disk rather than holding the whole PDF in memory.
                shutil.copyfileobj(response, file)
            self.current_view.setdefault('pdfs', []).append(filename)

            # Element id spelling kept exactly as in the original selector.
            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()

    @staticmethod
    def _new_record(case_info, parties):
        """Build a fresh case record with its own empty ``pdfs`` list."""
        return {
            'case_info': case_info,
            'petitioner_respondent': ' Vs '.join(parties.split('Vs')),
            'htmlfile': '',
            'pdfs': [],
        }

    @staticmethod
    def _unique_path(directory, suffix):
        """Return ``directory/<random hex><suffix>`` for a path that does not exist yet."""
        while True:
            candidate = f"{directory}/{uuid.uuid4().hex}{suffix}"
            if not os.path.exists(candidate):
                return candidate

    @staticmethod
    def _onclick_script(cell):
        """Return the ``onclick`` attribute of the first ``<a>`` in ``cell``, or ``None``."""
        link = cell.find('a')
        if link is None:
            return None
        return link.get('onclick')