author    Raghuram Subramani <raghus2247@gmail.com>  2025-03-27 23:05:13 +0530
committer Raghuram Subramani <raghus2247@gmail.com>  2025-03-27 23:05:13 +0530
commit    a02c8f4c8643b4b9a531e185813c5d82b6866ec0 (patch)
tree      a3cdd49df8412e63ac711c148df6814efa0a05e7
parent    7195110a466b0ed14de1b8ee4fa8d7bb79626018 (diff)
update
-rw-r--r--  scrape_ecourtindia_v6/modules/scraper_case_status.py |   2
-rw-r--r--  scrape_ecourtindia_v6/modules/scraper_orders.py      |  84
-rw-r--r--  scrape_ecourtindia_v6/orders_scrape_courts.py        | 130
-rw-r--r--  scrape_ecourtindia_v6/scrape_orders.py               | 113
4 files changed, 256 insertions(+), 73 deletions(-)
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index d9b925d..684d9d7 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -134,7 +134,7 @@ class ScraperCaseStatus(Scraper):
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
 
-            sleep(2)
+            sleep(0.7)
 
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
diff --git a/scrape_ecourtindia_v6/modules/scraper_orders.py b/scrape_ecourtindia_v6/modules/scraper_orders.py
index 78594e8..a9fe7be 100644
--- a/scrape_ecourtindia_v6/modules/scraper_orders.py
+++ b/scrape_ecourtindia_v6/modules/scraper_orders.py
@@ -1,17 +1,25 @@
 from time import sleep
+import tempfile
+import uuid
+import os
+
+from urllib import request
+
+from bs4 import BeautifulSoup
+
+import cv2
+import pytesseract
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.select import Select
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperOrders(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index')
+    def __init__(self, db, config):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=courtorder/index', headless=True)
 
-        self.db = TinyDB('db.json')
+        self.db = db
         self.config = config
 
     def goto_courtnumber(self):
@@ -27,3 +35,69 @@ class ScraperOrders(Scraper):
         print(f'COURT NUMBERS: {court_numbers}')
         return court_numbers
+
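+    # Best-effort captcha solving: screenshot the captcha image, OCR it with
+    # pytesseract, submit the form, and retry whenever the error alert shows.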
+    def submit_search(self):
+        captcha_incomplete = True
+        while captcha_incomplete:
+            img = self.driver.find_element(By.ID, 'captcha_image')
+            temp = tempfile.NamedTemporaryFile(suffix='.png')
+            img.screenshot(temp.name)
+
+            img = cv2.imread(temp.name)
+            text = pytesseract.image_to_string(img).strip()
+
+            element = self.driver.find_element(By.ID, 'order_no_captcha_code')
+            element.send_keys(text)
+
+            self.driver.execute_script('submitCourtNumber()')
+            sleep(3)
+
+            if self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed():
+                self.close_modal()
+                element.clear()
+            else:
+                captcha_incomplete = False
+
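+    # The results table is read as a flat list of <td> cells; each result
+    # spans five cells. Relative to the order-link cell at index i, i-1 is
+    # the date, i-2 the petitioner/respondent, and i-3 the case info.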
+    def parse_orders_table(self):
+        self.rows = []
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except Exception:
+            return
+
+        rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
+        i = 6
+        while i < len(rows):
+            self.rows.append([ rows[i], rows[i-1].text, rows[i-2].text, rows[i-3].text ])
+            i += 5
+
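+    # For each scraped row, trigger the onclick handler that loads the order
+    # PDF into an <object> element, download the PDF with the browser's
+    # session cookies, and record its metadata in the database.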
+    def handle_orders(self):
+        for row in self.rows:
+            order = row[0]
+
+            script = order.find_all('a')[0].get_attribute_list('onclick')[0]
+            self.driver.execute_script(script)
+
+            sleep(0.7)
+            obj = self.driver.find_elements(By.TAG_NAME, 'object')[-1]
+            pdf_url = str(obj.get_attribute('data'))
+
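+            # Pick an output path under pdf/ that doesn't collide with an
+            # existing download.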
+            while True:
+                filename = f"pdf/{uuid.uuid4().hex}.pdf"
+                if not os.path.exists(filename):
+                    break
+
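+            # urllib doesn't share Selenium's session, so replay the
+            # browser's cookies on the PDF request.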
+            cookies = "; ".join([f"{c['name']}={c['value']}" for c in self.driver.get_cookies()])
+            r = request.Request(pdf_url)
+            r.add_header("Cookie", cookies)
+
+            try:
+                with request.urlopen(r) as response, open(filename, "wb") as file:
+                    file.write(response.read())
+            except Exception as e:
+                print(f'UNABLE TO FETCH PDF: {pdf_url} ({e})')
+
+            record = { 'case_info': row[3], 'petitioner_respondent': row[2], 'date': row[1], 'filename': filename }
+            self.db.insert(record)
+
+            self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
diff --git a/scrape_ecourtindia_v6/orders_scrape_courts.py b/scrape_ecourtindia_v6/orders_scrape_courts.py
new file mode 100644
index 0000000..597ce9f
--- /dev/null
+++ b/scrape_ecourtindia_v6/orders_scrape_courts.py
@@ -0,0 +1,130 @@
+import csv
+from time import sleep
+
+from tinydb import TinyDB
+
+from modules.scraper_orders import ScraperOrders
+from selenium.webdriver.common.by import By
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+# ScraperOrders now takes a database handle; order metadata scraped by
+# scrape_orders() below is recorded here.
+db = TinyDB('orders.json')
+
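+# csv.writer isn't thread-safe, so writes from the worker threads are
+# serialized with a lock.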
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+            print(f'Wrote: {row}')
+
+    def close(self):
+        self.file.close()
+
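+# Walk every court complex in one district and record each court number to
+# the shared CSV.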
+def scrape_district(state, district, csv_writer):
+    try:
+        config = {}
+        scraper = ScraperOrders(db, config)
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', state)
+        scraper.select('sess_dist_code', district)
+
+        complexes = scraper.scrape_complexes()
+        scraper.select('court_complex_code', complexes[0])
+
+        sleep(2)
+        scraper.goto_courtnumber()
+
+        for cmplx in complexes:
+            # Dismiss any modal dialogs left over from the previous complex.
+            while True:
+                sleep(0.5)
+                try:
+                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                    if modal_is_open:
+                        scraper.close_modal()
+                        continue
+                    break
+                except Exception:
+                    break
+
+            scraper.select('court_complex_code', cmplx)
+            sleep(0.5)
+
+            court_numbers = scraper.get_court_numbers()
+            for court_number in court_numbers:
+                row = [state, district, cmplx, court_number]
+                csv_writer.writerow(row)
+
+        scraper.driver.quit()
+
+    except Exception as e:
+        print(f"Error scraping district {district}: {e}")
+
+def scrape_courts():
+    state = 'Uttar Pradesh'
+
+    config = {}
+    scraper = ScraperOrders(db, config)
+    scraper.close_modal()
+    scraper.select('sess_state_code', state)
+
+    districts = scraper.scrape_districts()
+    scraper.driver.quit()
+
+    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
+    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_district, state, district, csv_writer)
+            for district in districts
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+def scrape_orders(courts):
+    csvfile = open(courts, newline='')
+    reader = csv.reader(csvfile)
+
+    for row in reader:
+        print(row)
+        config = {}
+        scraper = ScraperOrders(db, config)
+        scraper.close_modal()
+
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+
+        # Dismiss any modal dialogs before interacting with the form.
+        while True:
+            sleep(0.5)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except Exception:
+                break
+
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+
+        scraper.select('nnjudgecode1', row[3])
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+        scraper.submit_search()
+
+        scraper.parse_orders_table()
+        scraper.handle_orders()
+
+        break  # only the first court is processed for now
+
+    csvfile.close()
+
+if __name__ == '__main__':
+    scrape_orders('csv/2023-24_pocso.csv')
diff --git a/scrape_ecourtindia_v6/scrape_orders.py b/scrape_ecourtindia_v6/scrape_orders.py
index 74cdd56..54a2d80 100644
--- a/scrape_ecourtindia_v6/scrape_orders.py
+++ b/scrape_ecourtindia_v6/scrape_orders.py
@@ -1,82 +1,71 @@
 import csv
 from time import sleep
+
+from tinydb import TinyDB
 from modules.scraper_orders import ScraperOrders
 from selenium.webdriver.common.by import By
 from concurrent.futures import ThreadPoolExecutor, as_completed
+
 import threading
 
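+# TinyDB's insert() isn't thread-safe; a shared lock serializes inserts
+# from the worker threads.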
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
+class ThreadSafeDB:
+    def __init__(self):
+        self.db = TinyDB('orders.json')
         self.lock = threading.Lock()
-
-    def writerow(self, row):
+
+    def insert(self, record):
         with self.lock:
-            self.writer.writerow(row)
-            print(f'Wrote: {row}')
+            self.db.insert(record)
+            print(f'INSERTED: {record}')
 
-    def close(self):
-        self.file.close()
+db = ThreadSafeDB()
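+
+# Each worker owns a headless browser session and walks a single court's
+# order search end to end.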
-def scrape_district(state, district, csv_writer):
+def scrape_single_court(row):
     try:
         config = {}
-        scraper = ScraperOrders(config)
+        scraper = ScraperOrders(db, config)
         scraper.close_modal()
 
-        scraper.select('sess_state_code', state)
-        scraper.select('sess_dist_code', district)
-
-        complexes = scraper.scrape_complexes()
-        scraper.select('court_complex_code', complexes[0])
-
-        sleep(2)
-        scraper.goto_courtnumber()
-
-        for cmplx in complexes:
-            while True:
-                sleep(0.5)
-                try:
-                    modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
-                    if modal_is_open:
-                        scraper.close_modal()
-                        continue
-                    break
-                except:
-                    break
-
-            scraper.select('court_complex_code', cmplx)
+        scraper.select('sess_state_code', row[0])
+        scraper.select('sess_dist_code', row[1])
+
+        while True:
             sleep(0.5)
-
-            court_numbers = scraper.get_court_numbers()
-            for court_number in court_numbers:
-                row = [state, district, cmplx, court_number]
-                csv_writer.writerow(row)
+            try:
+                modal_is_open = scraper.driver.find_element(By.CLASS_NAME, 'modal').is_displayed()
+                if modal_is_open:
+                    scraper.close_modal()
+                    continue
+                break
+            except Exception:
+                break
+
+        scraper.select('court_complex_code', row[2])
+        sleep(1)
+        scraper.goto_courtnumber()
+        scraper.select('nnjudgecode1', row[3])
+
+        scraper.driver.find_element(By.ID, 'radBoth2').click()
+
+        scraper.submit_search()
+        scraper.parse_orders_table()
+        scraper.handle_orders()
 
         scraper.driver.quit()
     except Exception as e:
-        print(f"Error scraping district {district}: {e}")
+        print(f"Error processing court {row}: {e}")
-def scrape_courts():
-    state = 'Uttar Pradesh'
-
-    config = {}
-    scraper = ScraperOrders(config)
-    scraper.close_modal()
-    scraper.select('sess_state_code', state)
-
-    districts = scraper.scrape_districts()
-    scraper.driver.quit()
-
-    csv_writer = ThreadSafeCSVWriter('csv/court_numbers.csv')
-    csv_writer.writerow(['State', 'District', 'Cmplx', 'Court number'])
+def scrape_orders(courts_csv):
+    with open(courts_csv, newline='') as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)  # skip the header row
+        courts = list(reader)
 
     with ThreadPoolExecutor(max_workers=5) as executor:
         futures = [
-            executor.submit(scrape_district, state, district, csv_writer)
-            for district in districts
+            executor.submit(scrape_single_court, court)
+            for court in courts
         ]
 
         for future in as_completed(futures):
@@ -85,16 +74,6 @@ def scrape_courts():
             except Exception as e:
                 print(f"A thread encountered an error: {e}")
 
-    csv_writer.close()
-
-def scrape_orders(courts):
-    csvfile = open(courts, newline='')
-    reader = csv.reader(csvfile)
-
-    for row in reader:
-        print(row)
-
-    csvfile.close()
-
 if __name__ == '__main__':
-    scrape_orders('csv/2023-24_pocso.csv')
+    input_file = 'csv/2023-24_pocso.csv'
+    scrape_orders(input_file)