about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Raghuram Subramani <raghus2247@gmail.com> 2025-03-26 22:06:32 +0530
committer Raghuram Subramani <raghus2247@gmail.com> 2025-03-26 22:06:32 +0530
commit ef63d21480f1f83a660902da3f9ad2d5606b37c2 (patch)
tree 322b1d1e8da88a62e1cfd4b0c767f53d3460203d
parent 24b38a94e36794e33a1a432ef00eaf0c46957124 (diff)
multi-threaded, headless scraper
-rw-r--r-- scrape_ecourtindia_v6/.gitignore 1
-rwxr-xr-x scrape_ecourtindia_v6/clean.sh 3
-rw-r--r-- scrape_ecourtindia_v6/main.py 70
-rw-r--r-- scrape_ecourtindia_v6/scraper.py 51
4 files changed, 110 insertions, 15 deletions
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
new file mode 100644
index 0000000..ef1949c
--- /dev/null
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -0,0 +1 @@
+courts.csv
diff --git a/scrape_ecourtindia_v6/clean.sh b/scrape_ecourtindia_v6/clean.sh
index bda1361..8c8a0ab 100755
--- a/scrape_ecourtindia_v6/clean.sh
+++ b/scrape_ecourtindia_v6/clean.sh
@@ -1,5 +1,4 @@
#!/usr/bin/env bash
rm -r html/* pdf/* db.json
-mkdir html
-mkdir pdf
+mkdir -p html pdf
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
index c81d0b6..1cadad2 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/main.py
@@ -1,18 +1,80 @@
+import csv
from scraper import Scraper
from tinydb import TinyDB
-import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
db = TinyDB('db.json')
-if __name__ == '__main__':
+class ThreadSafeCSVWriter:
+ def __init__(self, filename):
+ self.file = open(filename, 'w', newline='')
+ self.writer = csv.writer(self.file)
+ self.lock = threading.Lock()
+
+ def writerow(self, row):
+ with self.lock:
+ self.writer.writerow(row)
+
+ def close(self):
+ self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+ scraper = Scraper(db, config)
+ scraper.close_modal()
+ try:
+ for district in scraper.scrape_districts(state):
+ for cmplx in scraper.scrape_complexes(state, district):
+ csv_writer.writerow([state, district, cmplx])
+ except Exception as e:
+ print(f"Error scraping {state}: {e}")
+ finally:
+ scraper.driver.quit()
+
+def scrape_courts():
config = {}
+ m = Scraper(db, config)
+ m.close_modal()
+
+ csv_writer = ThreadSafeCSVWriter('courts.csv')
+ csv_writer.writerow(['State', 'District', 'Complex'])
+
+ states = m.scrape_states()
+ m.driver.close()
+
+ with ThreadPoolExecutor(max_workers=5) as executor:
+ futures = [
+ executor.submit(scrape_state_thread, state, config, csv_writer)
+ for state in states
+ ]
+
+ for future in as_completed(futures):
+ try:
+ future.result()
+ except Exception as e:
+ print(f"A thread encountered an error: {e}")
+
+ csv_writer.close()
+
+def scrape_orders():
+ config = {}
+
+ m = Scraper(db, config)
+ m.close_modal()
+
config['state'] = input('Select a state: ')
config['district'] = input('Select a district: ')
config['court_complex'] = input('Select a court complex: ')
config['court_establishment'] = input('Select a court establishment: ')
config['act'] = input('Select an act: ')
- m = Scraper(db, config)
- m.run()
+ m.select_court()
+ m.goto_acts()
+ m.select_act()
+ m.handle_table()
+
m.driver.close()
+
+if __name__ == '__main__':
+ scrape_courts()
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
index 69d3336..cdab2fd 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -6,6 +6,7 @@ from urllib import request
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
@@ -19,17 +20,14 @@ class Scraper:
self.db = db
self.config = config
- self.driver = Firefox()
+ options = Options()
+ options.add_argument("--headless")
+
+ self.driver = Firefox(options=options)
self.driver.get('https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
self.current_view = {}
- def run(self):
- self.close_modal()
- self.goto_acts()
- self.select_act()
- self.handle_table()
-
def close_modal(self):
sleep(3)
self.driver.execute_script('closeModel({modal_id:"validateError"})')
@@ -50,7 +48,42 @@ class Scraper:
self.driver.find_element(By.ID, 'radDAct').click()
self.submit_search()
- def goto_acts(self):
+ def scrape_states(self):
+ element = self.driver.find_element(By.ID, 'sess_state_code')
+ options = Select(element).options
+ states = [ option.text for option in options[1:] ]
+ print(f'STATES: {states}')
+
+ sleep(0.2)
+
+ return states
+
+ def scrape_districts(self, state):
+ self.select('sess_state_code', state)
+ sleep(0.2)
+
+ element = self.driver.find_element(By.ID, 'sess_dist_code')
+ options = Select(element).options
+ districts = [ option.text for option in options[1:] ]
+ print(f'DISTRICTS: {districts}')
+
+ return districts
+
+ def scrape_complexes(self, state, district):
+ self.select('sess_state_code', state)
+ sleep(0.2)
+ self.select('sess_dist_code', district)
+ sleep(0.2)
+
+ element = self.driver.find_element(By.ID, 'court_complex_code')
+ options = Select(element).options
+ complexes = [ option.text for option in options[1:] ]
+ print(f'COMPLEXES: {complexes}')
+
+ return complexes
+
+ def select_court(self):
+ sleep(2)
while True:
self.select('sess_state_code', self.config['state'])
self.select('sess_dist_code', self.config['district'])
@@ -66,7 +99,7 @@ class Scraper:
self.select('court_est_code', self.config['court_establishment'])
- sleep(1)
+ def goto_acts(self):
element = self.driver.find_element(By.ID, 'act-tabMenu')
element.click()
sleep(1)