-rw-r--r--   web/app/.gitignore              1
-rw-r--r--   web/app/job_manager.py          5
-rw-r--r--   web/app/jobs/scrape_cases.py    233
-rw-r--r--   web/app/main.py                 14
-rw-r--r--   web/app/templates/home.html     7
5 files changed, 123 insertions, 137 deletions
diff --git a/web/app/.gitignore b/web/app/.gitignore
new file mode 100644
index 0000000..17aa483
--- /dev/null
+++ b/web/app/.gitignore
@@ -0,0 +1 @@
+outputs/
diff --git a/web/app/job_manager.py b/web/app/job_manager.py
index bdbe994..3d588a6 100644
--- a/web/app/job_manager.py
+++ b/web/app/job_manager.py
@@ -25,7 +25,4 @@ class JobManager:
started_job_ids = self.q.started_job_registry.get_job_ids()
started_jobs = [Job.fetch(job_id, connection=self.redis) for job_id in started_job_ids]
- finished_job_ids = self.q.finished_job_registry.get_job_ids()
- finished_jobs = [Job.fetch(job_id, connection=self.redis) for job_id in finished_job_ids]
-
- return queued_jobs + started_jobs + finished_jobs
+ return queued_jobs + started_jobs
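
With finished jobs no longer fetched from RQ, JobManager.get_jobs() now reports only queued and running work; completed runs are recorded in TinyDB by the scrape job itself (see scrape_cases.py below). A minimal sketch of the resulting method, assuming the queued_jobs lookup earlier in the file reads the queue's own job list (that part is outside this hunk):

    from redis import Redis
    from rq import Queue
    from rq.job import Job

    class JobManager:
        def __init__(self):
            self.redis = Redis()
            self.q = Queue(connection=self.redis)

        def get_jobs(self):
            # Jobs still waiting for a worker.
            queued_jobs = self.q.jobs
            # Jobs currently executing; the finished-job registry is no longer queried.
            started_job_ids = self.q.started_job_registry.get_job_ids()
            started_jobs = [Job.fetch(job_id, connection=self.redis)
                            for job_id in started_job_ids]
            return queued_jobs + started_jobs
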
diff --git a/web/app/jobs/scrape_cases.py b/web/app/jobs/scrape_cases.py
index 7a944f1..237acbc 100644
--- a/web/app/jobs/scrape_cases.py
+++ b/web/app/jobs/scrape_cases.py
@@ -1,160 +1,141 @@
from app.modules.interface import Interface
-from tinydb import TinyDB
from bs4 import BeautifulSoup
-import time
import csv
-def scrape_cases(name, acts, sections, state_code):
- acts = set(acts)
- db = TinyDB(f'app/outputs/{name}.json')
- interface = Interface()
+from tinydb import TinyDB
+
+db = TinyDB('app/jobs.json')
+def get_districts(interface, state_code):
try:
- districts = interface.get_districts(state_code)
+ return interface.get_districts(state_code)
except Exception as e:
print(f"[ERROR] Failed to scrape districts: {e}")
- districts = []
+ return []
+
+def get_complexes(interface, state_code, dist_code, dist_name):
+ try:
+ return interface.get_complexes(state_code, dist_code)
+ except Exception as e:
+ print(f"[ERROR] Failed to scrape complexes for {dist_name}: {e}")
+ return []
+
+def fetch_cases(interface, state_code, dist_code, court_establishment, act, section, complex_name):
+ try:
+ return interface.search_by_act(state_code, dist_code, court_establishment, act, section)
+ except Exception as e:
+ print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
+ return []
+
+def fetch_case_history(interface, state_code, dist_code, court_establishment, case_no):
+ try:
+ return interface.case_history(state_code, dist_code, court_establishment, case_no)
+ except Exception as e:
+ print(f"[ERROR] Failed to get history for case {case_no}: {e}")
+ return None
+
+def parse_orders(order_html):
+ soup = BeautifulSoup(order_html or '', features="html.parser")
+ orders = []
+ for row in soup.select('table.tbl-result tbody tr'):
+ cells = row.find_all('td')
+ if len(cells) >= 2:
+ order_date = cells[1].get_text(strip=True)
+ link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
+ if link_tag:
+ orders.append({'date': order_date, 'link': link_tag['href']})
+ return orders
+
+def parse_acts(entry, all_acts):
+ soup = BeautifulSoup(entry.get('act', ''), 'html.parser')
+ acts = []
+ for row in soup.select('tbody tr'):
+ cells = row.find_all('td')
+ if len(cells) == 2:
+ act = cells[0].get_text(strip=True)
+ section = cells[1].get_text(strip=True)
+ if act not in all_acts:
+ all_acts.append(act)
+ acts.append(f"{act}: {section}")
+ return '\n'.join(acts)
+
+def write_to_csv(entries, key_mapping, name):
+    max_final = max((len(entry.get('final_orders', [])) for entry in entries), default=0)
+    max_interim = max((len(entry.get('interim_orders', [])) for entry in entries), default=0)
+
+ with open(f'app/outputs/{name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
+ writer = csv.writer(csvfile)
+ headers = list(key_mapping.values()) + \
+ [f'Final Order {i+1}' for i in range(max_final)] + \
+ [f'Interim Order {i+1}' for i in range(max_interim)]
+ writer.writerow(headers)
+
+ for entry in entries:
+ row = [entry.get(key, '') for key in key_mapping]
+
+ for order in entry.get('final_orders', []):
+ row.append(f'=HYPERLINK("{order["link"]}", "{order["date"]}")')
+ row += [''] * (max_final - len(entry.get('final_orders', [])))
+
+ for order in entry.get('interim_orders', []):
+ row.append(f'=HYPERLINK("{order["link"]}", "{order["date"]}")')
+ row += [''] * (max_interim - len(entry.get('interim_orders', [])))
+
+ writer.writerow(row)
+def scrape_cases(name, acts, sections, state_code):
+ acts = set(acts)
+ entries = []
+ interface = Interface()
+
+ districts = get_districts(interface, state_code)
for dist_code, dist_name in districts:
print(f'DISTRICT: {dist_name}')
-
- try:
- complexes = interface.get_complexes(state_code, dist_code)
- except Exception as e:
- print(f"[ERROR] Failed to scrape complexes for {dist_name}: {e}")
- continue
+ complexes = get_complexes(interface, state_code, dist_code, dist_name)
for complex_code, complex_name in complexes:
print(f'COMPLEX: {complex_name}')
-
court_establishments = str(complex_code).split(',')
+
for i, court_establishment in enumerate(court_establishments, 1):
print(f'ESTABLISHMENT: {i}/{len(court_establishments)}')
for act in acts:
for section in sections:
- try:
- cases = interface.search_by_act(state_code, dist_code, court_establishment, act, section)
- except Exception as e:
- print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
- continue
+ cases = fetch_cases(interface, state_code, dist_code, court_establishment, act, section, complex_name)
for j, case in enumerate(cases, 1):
print(f'CASE: {j}/{len(cases)}')
-
- try:
- case_no = case['case_no']
- case_history = interface.case_history(state_code, dist_code, court_establishment, case_no)
- except Exception as e:
- print(f"[ERROR] Failed to get history for case {case.get('case_no', 'UNKNOWN')}: {e}")
+ case_no = case.get('case_no')
+ if not case_no:
continue
- try:
- case_history['case_no'] = case_no
- case_history['complex_name'] = complex_name
- db.insert(case_history)
+ case_history = fetch_case_history(interface, state_code, dist_code, court_establishment, case_no)
+ if not case_history:
+ continue
- except Exception as e:
- print(f"[ERROR] Failed to parse orders for case {case_no}: {e}")
-
- entries = db.all()
+ case_history['case_no'] = case_no
+ case_history['complex_name'] = complex_name
+ entries.append(case_history)
key_mapping = {
- 'case_no': 'Case Number',
- 'cino': 'CNR Number',
- 'type_name': 'Case Type',
-
- 'reg_no': 'Registration Number',
- 'reg_year': 'Registration Year',
-
- 'district_name': 'District',
- 'complex_name': 'Complex Name',
- 'court_name': 'Court Name',
-
- 'dt_regis': 'Registration Date',
- 'date_of_filing': 'Date of Filing',
- 'date_of_decision': 'Date of Decision',
- 'disp_name': 'Disposition',
-
- 'acts': 'Acts',
-
- 'pet_name': 'Petitioner',
- 'pet_adv': 'Petitioner Advocate',
- 'petparty_name': 'Petitioner Party Name',
-
- 'res_name': 'Respondent',
- 'res_adv': 'Respondent Advocate',
- 'resparty_name': 'Respondent Party Name'
+ 'case_no': 'Case Number', 'cino': 'CNR Number', 'type_name': 'Case Type',
+ 'reg_no': 'Registration Number', 'reg_year': 'Registration Year',
+ 'district_name': 'District', 'complex_name': 'Complex Name', 'court_name': 'Court Name',
+ 'dt_regis': 'Registration Date', 'date_of_filing': 'Date of Filing', 'date_of_decision': 'Date of Decision',
+ 'disp_name': 'Disposition', 'acts': 'Acts',
+ 'pet_name': 'Petitioner', 'pet_adv': 'Petitioner Advocate', 'petparty_name': 'Petitioner Party Name',
+ 'res_name': 'Respondent', 'res_adv': 'Respondent Advocate', 'resparty_name': 'Respondent Party Name'
}
all_acts = []
-
for entry in entries:
- soup = BeautifulSoup(entry.get('finalOrder') or '', features="html.parser")
- final_orders = []
- for row in soup.select('table.tbl-result tbody tr'):
- cells = row.find_all('td')
- if len(cells) >= 2:
- order_date = cells[1].get_text(strip=True)
- link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
- if link_tag:
- final_orders.append({'date': order_date, 'link': link_tag['href']})
-
- soup = BeautifulSoup(entry.get('interimOrder') or '', features="html.parser")
- interim_orders = []
- for row in soup.select('table.tbl-result tbody tr'):
- cells = row.find_all('td')
- if len(cells) >= 2:
- order_date = cells[1].get_text(strip=True)
- link_tag = cells[2].find('a', href=True) if len(cells) > 2 else None
- if link_tag:
- interim_orders.append({'date': order_date, 'link': link_tag['href']})
-
- act_html = entry.get('act', '')
- soup = BeautifulSoup(act_html, 'html.parser')
-
- acts = []
- for row in soup.select('tbody tr'):
- cells = row.find_all('td')
- if len(cells) == 2:
- act = cells[0].get_text(strip=True)
- section = cells[1].get_text(strip=True)
- if act not in all_acts:
- all_acts.append(act)
-
- acts.append(f"{act}: {section}")
-
- entry['acts'] = '\n'.join(acts)
- entry['final_orders'] = final_orders
- entry['interim_orders'] = interim_orders
+ entry['final_orders'] = parse_orders(entry.get('finalOrder'))
+ entry['interim_orders'] = parse_orders(entry.get('interimOrder'))
+ entry['acts'] = parse_acts(entry, all_acts)
- max_final = max(len(entry.get('final_orders', [])) for entry in entries)
- max_interim = max(len(entry.get('interim_orders', [])) for entry in entries)
-
- with open(f'app/outputs/{name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
- writer = csv.writer(csvfile)
+ write_to_csv(entries, key_mapping, name)
- headers = list(key_mapping.values())
-
- headers += [f'Final Order {i+1}' for i in range(max_final)]
- headers += [f'Interim Order {i+1}' for i in range(max_interim)]
- writer.writerow(headers)
-
- for entry in entries:
- row = []
- for key in key_mapping:
- row.append(entry.get(key, ''))
-
- final_orders = entry.get('final_orders', [])
- for order in final_orders:
- hyperlink = f'=HYPERLINK("{order["link"]}", "{order["date"]}")'
- row.append(hyperlink)
- row += [''] * (max_final - len(final_orders))
-
- interim_orders = entry.get('interim_orders', [])
- for order in interim_orders:
- hyperlink = f'=HYPERLINK("{order["link"]}", "{order["date"]}")'
- row.append(hyperlink)
- row += [''] * (max_interim - len(interim_orders))
-
- writer.writerow(row)
+ db.insert({
+ "name": name
+ })
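
The refactor splits the old monolithic scrape_cases() into small helpers; parse_orders() in particular reads each row of the portal's order table, taking the second cell as the order date and the link in the third cell as the download URL. A quick illustration with made-up markup (the table structure is an assumption inferred from the selectors above):

    sample_html = """
    <table class="tbl-result"><tbody>
      <tr><td>1</td><td>01-02-2023</td><td><a href="/orders/1.pdf">view</a></td></tr>
      <tr><td>2</td><td>15-03-2023</td><td><a href="/orders/2.pdf">view</a></td></tr>
    </tbody></table>
    """
    print(parse_orders(sample_html))
    # [{'date': '01-02-2023', 'link': '/orders/1.pdf'},
    #  {'date': '15-03-2023', 'link': '/orders/2.pdf'}]
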
diff --git a/web/app/main.py b/web/app/main.py
index cc141b8..e834c0c 100644
--- a/web/app/main.py
+++ b/web/app/main.py
@@ -1,6 +1,7 @@
from flask import request, flash, send_from_directory
from flask import Blueprint, render_template, redirect, url_for
from flask_login import login_required, logout_user, current_user
+from tinydb import TinyDB
from .models import User
import json
@@ -10,8 +11,6 @@ import os
from .modules.interface import Interface
from .job_manager import JobManager
-from tinydb import TinyDB
-
states = Interface().get_states()
act_list = json.loads(open('app/acts.json').read())
@@ -22,7 +21,8 @@ main = Blueprint('main', __name__)
@login_required
def home():
jobs = job_manager.get_jobs()
- return render_template('home.html', user=current_user, states=states, acts=act_list, jobs=jobs)
+    completed_jobs = TinyDB('app/jobs.json').all()
+ return render_template('home.html', user=current_user, states=states, acts=act_list, completed_jobs=completed_jobs, jobs=jobs)
@main.route('/logout')
@login_required
@@ -58,14 +58,14 @@ def create_user():
@login_required
def enqueue_job():
acts = request.form.getlist('act')
- sections = request.form.get('section').split(',')
+    sections = [s.strip() for s in request.form.get('section', '').split(',') if s.strip()]
state_code = request.form.get('state_code')
name = request.form.get('name')
- if not section:
- section = ''
+    if not sections:
+        sections = ['']
- job = job_manager.enqueue_scrape(f'{name} - {time.time_ns()}', acts, sections, state_code)
+ job_manager.enqueue_scrape(f'{name} - {time.time_ns()}', acts, sections, state_code)
flash('Job created.', 'info')
return redirect(url_for('main.home'))
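
The home.html change below renders completed jobs from jobs.json with a Download link pointing at main.download_output, a route that is not part of this diff. A hypothetical sketch of such a route, assuming outputs live under app/outputs/ and the .csv suffix is appended server-side (both assumptions), reusing the send_from_directory import already in main.py:

    @main.route('/download/<path:filename>')
    @login_required
    def download_output(filename):
        # send_from_directory rejects paths that escape the outputs directory.
        return send_from_directory('app/outputs', f'{filename}.csv', as_attachment=True)
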
diff --git a/web/app/templates/home.html b/web/app/templates/home.html
index 797c66d..7caff64 100644
--- a/web/app/templates/home.html
+++ b/web/app/templates/home.html
@@ -62,6 +62,13 @@
</tr>
</thead>
<tbody>
+ {% for job in completed_jobs %}
+ <tr>
+ <td>{{ job['name'] }}</td>
+ <td>COMPLETED</td>
+ <td><a href="{{ url_for('main.download_output', filename=job['name']) }}">Download</a></td>
+ </tr>
+ {% endfor %}
{% for job in jobs %}
<tr>
<td>{{ job.args[0] }}</td>