author    | Raghuram Subramani <raghus2247@gmail.com> | 2025-05-11 19:00:48 +0530
committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-05-11 19:00:48 +0530
commit    | af8665ad224f923bdffd26fa75e7d4a8714b976c (patch)
tree      | 006a5f1ec20a5f6c413f4d21559d4cbd8b1570ca /web/app/jobs/scrape_cases.py
parent    | 7e6d28b39ec3f706d86280804011f7436df90851 (diff)
update web
Diffstat (limited to 'web/app/jobs/scrape_cases.py')
-rw-r--r-- | web/app/jobs/scrape_cases.py | 160
1 file changed, 125 insertions, 35 deletions
diff --git a/web/app/jobs/scrape_cases.py b/web/app/jobs/scrape_cases.py
index ec31f8a..9cd4930 100644
--- a/web/app/jobs/scrape_cases.py
+++ b/web/app/jobs/scrape_cases.py
@@ -1,16 +1,13 @@
-from modules.interface import Interface
+from app.modules.interface import Interface
 from tinydb import TinyDB
+from bs4 import BeautifulSoup
 import time
+import csv
 
-def scrape_cases(act, section, state_code, name=time.time_ns()):
-    db = TinyDB(f'{name}.json')
+def scrape_cases(name, acts, section, state_code):
+    db = TinyDB(f'app/outputs/{name}.json')
     interface = Interface()
 
-    def get_act_number(acts):
-        for act_code, act_name in acts:
-            if act_name == act:
-                return act_code
-        return None
     try:
         districts = interface.get_districts(state_code)
     except Exception as e:
@@ -33,36 +30,129 @@ def scrape_cases(act, section, state_code, name=time.time_ns()):
             for i, court_establishment in enumerate(court_establishments, 1):
                 print(f'ESTABLISHMENT: {i}/{len(court_establishments)}')
 
-                try:
-                    acts = interface.get_acts(state_code, dist_code, court_establishment)
-                    act_number = get_act_number(acts)
-                except Exception as e:
-                    print(f"[ERROR] Failed to scrape acts for complex {complex_name}: {e}")
-                    continue
+                for act in acts:
+                    try:
+                        cases = interface.search_by_act(state_code, dist_code, court_establishment, act, section)
+                    except Exception as e:
+                        print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
+                        continue
 
-                if not act_number:
-                    continue
+                    for j, case in enumerate(cases, 1):
+                        print(f'CASE: {j}/{len(cases)}')
 
-                try:
-                    cases = interface.search_by_act(state_code, dist_code, court_establishment, act_number, section)
-                except Exception as e:
-                    print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
-                    continue
+                        try:
+                            case_no = case['case_no']
+                            case_history = interface.case_history(state_code, dist_code, court_establishment, case_no)
+                        except Exception as e:
+                            print(f"[ERROR] Failed to get history for case {case.get('case_no', 'UNKNOWN')}: {e}")
+                            continue
 
-                for j, case in enumerate(cases, 1):
-                    print(f'CASE: {j}/{len(cases)}')
+                        try:
+                            case_history['case_no'] = case_no
+                            case_history['complex_name'] = complex_name
+                            db.insert(case_history)
 
-                    try:
-                        case_no = case['case_no']
-                        case_history = interface.case_history(state_code, dist_code, court_establishment, case_no)
-                    except Exception as e:
-                        print(f"[ERROR] Failed to get history for case {case.get('case_no', 'UNKNOWN')}: {e}")
-                        continue
+                        except Exception as e:
+                            print(f"[ERROR] Failed to parse orders for case {case_no}: {e}")
+
+    entries = db.all()
 
-                    try:
-                        case_history['case_no'] = case_no
-                        case_history['complex_name'] = complex_name
-                        db.insert(case_history)
+    key_mapping = {
+        'case_no': 'Case Number',
+        'cino': 'CNR Number',
+        'type_name': 'Case Type',
 
-                    except Exception as e:
-                        print(f"[ERROR] Failed to parse orders for case {case_no}: {e}")
+        'reg_no': 'Registration Number',
+        'reg_year': 'Registration Year',
+
+        'district_name': 'District',
+        'complex_name': 'Complex Name',
+        'court_name': 'Court Name',
+
+        'dt_regis': 'Registration Date',
+        'date_of_filing': 'Date of Filing',
+        'date_of_decision': 'Date of Decision',
+        'disp_name': 'Disposition',
+
+        'acts': 'Acts',
+
+        'pet_name': 'Petitioner',
+        'pet_adv': 'Petitioner Advocate',
+        'petparty_name': 'Petitioner Party Name',
+
+        'res_name': 'Respondent',
+        'res_adv': 'Respondent Advocate',
+        'resparty_name': 'Respondent Party Name'
+    }
+
+    all_acts = []
+
+    for entry in entries:
+        soup = BeautifulSoup(entry.get('finalOrder') or '', features="html.parser")
+        final_orders = []
+        for row in soup.select('table.tbl-result tbody tr'):
+            cells = row.find_all('td')
+            if len(cells) >= 2:
+                order_date = cells[1].get_text(strip=True)
+                link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
+                if link_tag:
+                    final_orders.append({'date': order_date, 'link': link_tag['href']})
+
+        soup = BeautifulSoup(entry.get('interimOrder') or '', features="html.parser")
+        interim_orders = []
+        for row in soup.select('table.tbl-result tbody tr'):
+            cells = row.find_all('td')
+            if len(cells) >= 2:
+                order_date = cells[1].get_text(strip=True)
+                link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
+                if link_tag:
+                    interim_orders.append({'date': order_date, 'link': link_tag['href']})
+
+        act_html = entry.get('act', '')
+        soup = BeautifulSoup(act_html, 'html.parser')
+
+        acts = []
+        for row in soup.select('tbody tr'):
+            cells = row.find_all('td')
+            if len(cells) == 2:
+                act = cells[0].get_text(strip=True)
+                section = cells[1].get_text(strip=True)
+                if act not in all_acts:
+                    all_acts.append(act)
+
+                acts.append(f"{act}: {section}")
+
+        entry['acts'] = '\n'.join(acts)
+        entry['final_orders'] = final_orders
+        entry['interim_orders'] = interim_orders
+
+    max_final = max(len(entry.get('final_orders', [])) for entry in entries)
+    max_interim = max(len(entry.get('interim_orders', [])) for entry in entries)
+
+    with open(f'app/outputs/{name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+
+        headers = list(key_mapping.values())
+
+        headers += [f'Final Order {i+1}' for i in range(max_final)]
+        headers += [f'Interim Order {i+1}' for i in range(max_interim)]
+        writer.writerow(headers)
+
+        for entry in entries:
+            row = []
+            for key in key_mapping:
+                row.append(entry.get(key, ''))
+
+            final_orders = entry.get('final_orders', [])
+            for order in final_orders:
+                hyperlink = f'=HYPERLINK("{order["link"]}", "{order["date"]}")'
+                row.append(hyperlink)
+            row += [''] * (max_final - len(final_orders))
+
+            interim_orders = entry.get('interim_orders', [])
+            for order in interim_orders:
+                hyperlink = f'=HYPERLINK("{order["link"]}", "{order["date"]}")'
+                row.append(hyperlink)
+            row += [''] * (max_interim - len(interim_orders))
+
+            writer.writerow(row)
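
With this change, scrape_cases takes the output basename first and a list of act codes (it no longer resolves a single act name per establishment), and it writes both app/outputs/<name>.json and app/outputs/<name>.csv. A minimal sketch of calling the new entry point; the import path and every code value below are placeholder assumptions for illustration, not values taken from this repository:

# Hypothetical invocation of the updated job; actual act/section/state codes
# come from the eCourts data that app.modules.interface queries.
from app.jobs.scrape_cases import scrape_cases  # assumed module path

scrape_cases(
    name='test_run',    # output basename -> app/outputs/test_run.json and .csv
    acts=['1234'],      # list of act codes, each passed to interface.search_by_act
    section='15',       # section filter, passed through unchanged
    state_code='1',     # state code expected by the interface
)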