diff options
| author | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-26 22:19:19 +0530 |
|---|---|---|
| committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-26 22:19:19 +0530 |
| commit | f1f43d3448bc879eed55f1e6865c06e646b7eb4a (patch) | |
| tree | 3b67eefca59ffc4bd46b418ec401a3c36b753542 | |
| parent | ef63d21480f1f83a660902da3f9ad2d5606b37c2 (diff) | |
implement scraping of establishments
| -rw-r--r-- | scrape_ecourtindia_v6/main.py | 10 | ||||
| -rw-r--r-- | scrape_ecourtindia_v6/scraper.py | 15 |
2 files changed, 23 insertions, 2 deletions
```diff
diff --git a/scrape_ecourtindia_v6/main.py b/scrape_ecourtindia_v6/main.py
index 1cadad2..9d4c193 100644
--- a/scrape_ecourtindia_v6/main.py
+++ b/scrape_ecourtindia_v6/main.py
@@ -6,6 +6,8 @@ import threading
 
 db = TinyDB('db.json')
 
+SCRAPE_ESTABLISHMENTS = True
+
 class ThreadSafeCSVWriter:
     def __init__(self, filename):
         self.file = open(filename, 'w', newline='')
@@ -25,7 +27,11 @@ def scrape_state_thread(state, config, csv_writer):
     try:
         for district in scraper.scrape_districts(state):
             for cmplx in scraper.scrape_complexes(state, district):
-                csv_writer.writerow([state, district, cmplx])
+                if SCRAPE_ESTABLISHMENTS:
+                    for establishment in scraper.scrape_establishments(state, district, cmplx):
+                        csv_writer.writerow([ state, district, cmplx, establishment ])
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
     except Exception as e:
         print(f"Error scraping {state}: {e}")
     finally:
@@ -43,7 +49,7 @@ def scrape_courts():
     states = m.scrape_states()
     m.driver.close()
 
-    with ThreadPoolExecutor(max_workers=5) as executor:
+    with ThreadPoolExecutor(max_workers=10) as executor:
         futures = [
             executor.submit(scrape_state_thread, state, config, csv_writer)
             for state in states
diff --git a/scrape_ecourtindia_v6/scraper.py b/scrape_ecourtindia_v6/scraper.py
index cdab2fd..18b519a 100644
--- a/scrape_ecourtindia_v6/scraper.py
+++ b/scrape_ecourtindia_v6/scraper.py
@@ -82,6 +82,21 @@ class Scraper:
 
         return complexes
 
+    def scrape_establishments(self, state, district, cmplx):
+        self.select('sess_state_code', state)
+        sleep(0.2)
+        self.select('sess_dist_code', district)
+        sleep(0.2)
+        self.select('court_complex_code', cmplx)
+        sleep(1)
+
+        element = self.driver.find_element(By.ID, 'court_est_code')
+        options = Select(element).options
+        establishments = [ option.text for option in options[1:] ]
+        print(f'ESTABLISHMENTS: {establishments}')
+
+        return establishments
+
     def select_court(self):
         sleep(2)
         while True:
```
