about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Raghuram Subramani <raghus2247@gmail.com> 2025-03-26 22:19:19 +0530
committer Raghuram Subramani <raghus2247@gmail.com> 2025-03-26 22:19:19 +0530
commit f1f43d3448bc879eed55f1e6865c06e646b7eb4a (patch)
tree 3b67eefca59ffc4bd46b418ec401a3c36b753542
parent ef63d21480f1f83a660902da3f9ad2d5606b37c2 (diff)
implement scraping of establishments
-rw-r--r-- scrape_ecourtindia_v6/main.py 10
-rw-r--r-- scrape_ecourtindia_v6/scraper.py 15
2 files changed, 23 insertions, 2 deletions
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
index 1cadad2..9d4c193 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/main.py
@@ -6,6 +6,8 @@ import threading
db = TinyDB('db.json')
+SCRAPE_ESTABLISHMENTS = True
+
class ThreadSafeCSVWriter:
def __init__(self, filename):
self.file = open(filename, 'w', newline='')
@@ -25,7 +27,11 @@ def scrape_state_thread(state, config, csv_writer):
try:
for district in scraper.scrape_districts(state):
for cmplx in scraper.scrape_complexes(state, district):
- csv_writer.writerow([state, district, cmplx])
+ if SCRAPE_ESTABLISHMENTS:
+ for establishment in scraper.scrape_establishments(state, district, cmplx):
+ csv_writer.writerow([ state, district, cmplx, establishment ])
+ else:
+ csv_writer.writerow([ state, district, cmplx ])
except Exception as e:
print(f"Error scraping {state}: {e}")
finally:
@@ -43,7 +49,7 @@ def scrape_courts():
states = m.scrape_states()
m.driver.close()
- with ThreadPoolExecutor(max_workers=5) as executor:
+ with ThreadPoolExecutor(max_workers=10) as executor:
futures = [
executor.submit(scrape_state_thread, state, config, csv_writer)
for state in states
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
index cdab2fd..18b519a 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -82,6 +82,21 @@ class Scraper:
return complexes
+    def scrape_establishments(self, state, district, cmplx):
+        """Collect the establishment names offered for one court complex.
+
+        Picks `state`, `district` and `cmplx` in turn through `self.select`,
+        pausing after each pick, then reads the option texts of the
+        `court_est_code` element.
+
+        Returns:
+            A list with the text of every option except the first one.
+        """
+        # (selector name, value to choose, seconds to pause afterwards)
+        # NOTE(review): the pauses presumably give the page time to refill
+        # the next dropdown -- confirm against the site's behaviour.
+        selection_steps = (
+            ('sess_state_code', state, 0.2),
+            ('sess_dist_code', district, 0.2),
+            ('court_complex_code', cmplx, 1),
+        )
+        for field, value, pause in selection_steps:
+            self.select(field, value)
+            sleep(pause)
+
+        dropdown = Select(self.driver.find_element(By.ID, 'court_est_code'))
+        # The first option is skipped; presumably it is a placeholder entry
+        # rather than a real establishment -- TODO confirm.
+        establishments = [ entry.text for entry in dropdown.options[1:] ]
+        print(f'ESTABLISHMENTS: {establishments}')
+
+        return establishments
+
def select_court(self):
sleep(2)
while True: