author    | Raghuram Subramani <raghus2247@gmail.com> | 2025-05-11 19:20:58 +0530
committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-05-11 19:20:58 +0530
commit    | a44d6029f2aed0389750ce0cd694a369450fef57 (patch)
tree      | b98a254b308d7fea6ddcb0ffc347d1ebeadce935
parent    | af8665ad224f923bdffd26fa75e7d4a8714b976c (diff)
update
-rw-r--r-- | web/app/job_manager.py       | 14
-rw-r--r-- | web/app/jobs/scrape_cases.py | 40
-rw-r--r-- | web/app/main.py              |  8
-rw-r--r-- | web/app/templates/home.html  |  5

4 files changed, 36 insertions, 31 deletions
diff --git a/web/app/job_manager.py b/web/app/job_manager.py
index abec11d..fa486f3 100644
--- a/web/app/job_manager.py
+++ b/web/app/job_manager.py
@@ -8,18 +8,22 @@ class JobManager:
         self.redis = Redis()
         self.q = Queue(connection=self.redis)
 
-    def enqueue_scrape(self, name, acts, section, state_code):
+    def enqueue_scrape(self, name, acts, sections, state_code):
         # 4 hour timeout
         return self.q.enqueue(
             scrape_cases,
             name,
             acts,
-            section,
+            sections,
             state_code,
             job_timeout=14400
         )
 
-    def get_started_jobs(self):
+    def get_jobs(self):
         started_job_ids = self.q.started_job_registry.get_job_ids()
-        jobs = [Job.fetch(job_id, connection=self.redis) for job_id in started_job_ids]
-        return jobs
+        started_jobs = [Job.fetch(job_id, connection=self.redis) for job_id in started_job_ids]
+
+        finished_job_ids = self.q.finished_job_registry.get_job_ids()
+        finished_jobs = [Job.fetch(job_id, connection=self.redis) for job_id in finished_job_ids]
+
+        return started_jobs + finished_jobs
diff --git a/web/app/jobs/scrape_cases.py b/web/app/jobs/scrape_cases.py
index 9cd4930..7a944f1 100644
--- a/web/app/jobs/scrape_cases.py
+++ b/web/app/jobs/scrape_cases.py
@@ -4,7 +4,8 @@ from bs4 import BeautifulSoup
 import time
 import csv
 
-def scrape_cases(name, acts, section, state_code):
+def scrape_cases(name, acts, sections, state_code):
+    acts = set(acts)
     db = TinyDB(f'app/outputs/{name}.json')
     interface = Interface()
 
@@ -31,29 +32,30 @@ def scrape_cases(name, acts, sections, state_code):
             print(f'ESTABLISHMENT: {i}/{len(court_establishments)}')
 
             for act in acts:
-                try:
-                    cases = interface.search_by_act(state_code, dist_code, court_establishment, act, section)
-                except Exception as e:
-                    print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
-                    continue
-
-                for j, case in enumerate(cases, 1):
-                    print(f'CASE: {j}/{len(cases)}')
-
+                for section in sections:
                     try:
-                        case_no = case['case_no']
-                        case_history = interface.case_history(state_code, dist_code, court_establishment, case_no)
+                        cases = interface.search_by_act(state_code, dist_code, court_establishment, act, section)
                     except Exception as e:
-                        print(f"[ERROR] Failed to get history for case {case.get('case_no', 'UNKNOWN')}: {e}")
+                        print(f"[ERROR] Failed to scrape cases in complex {complex_name}: {e}")
                         continue
 
-                    try:
-                        case_history['case_no'] = case_no
-                        case_history['complex_name'] = complex_name
-                        db.insert(case_history)
+                    for j, case in enumerate(cases, 1):
+                        print(f'CASE: {j}/{len(cases)}')
 
-                    except Exception as e:
-                        print(f"[ERROR] Failed to parse orders for case {case_no}: {e}")
+                        try:
+                            case_no = case['case_no']
+                            case_history = interface.case_history(state_code, dist_code, court_establishment, case_no)
+                        except Exception as e:
+                            print(f"[ERROR] Failed to get history for case {case.get('case_no', 'UNKNOWN')}: {e}")
+                            continue
+
+                        try:
+                            case_history['case_no'] = case_no
+                            case_history['complex_name'] = complex_name
+                            db.insert(case_history)
+
+                        except Exception as e:
+                            print(f"[ERROR] Failed to parse orders for case {case_no}: {e}")
 
     entries = db.all()
 
diff --git a/web/app/main.py b/web/app/main.py
index cc3f995..1266cb8 100644
--- a/web/app/main.py
+++ b/web/app/main.py
@@ -21,7 +21,7 @@ main = Blueprint('main', __name__)
 @main.route('/')
 @login_required
 def home():
-    jobs = job_manager.get_started_jobs()
+    jobs = job_manager.get_jobs()
     return render_template('home.html', user=current_user, states=states, acts=act_list, jobs=jobs)
 
 @main.route('/logout')
@@ -57,14 +57,14 @@ def create_user():
 @login_required
 def enqueue_job():
     acts = request.form.getlist('act')
-    section = request.form.get('section')
+    sections = request.form.get('section').split(',')
     state_code = request.form.get('state_code')
     name = request.form.get('name')
 
     if not section:
         section = ''
 
-    job = job_manager.enqueue_scrape(f'{name} - {time.time_ns()}', acts, section, state_code)
+    job = job_manager.enqueue_scrape(f'{name} - {time.time_ns()}', acts, sections, state_code)
     flash('Job created.', 'info')
     return redirect(url_for('main.home'))
 
@@ -72,5 +72,5 @@ def enqueue_job():
 @main.route('/download/<filename>')
 @login_required
 def download_output(filename):
-    output_dir = os.path.join(os.getcwd(), 'outputs')
+    output_dir = os.path.join(os.getcwd(), 'app/outputs')
     return send_from_directory(output_dir, f'{filename}.csv', as_attachment=True)
diff --git a/web/app/templates/home.html b/web/app/templates/home.html
index 809269d..d35767a 100644
--- a/web/app/templates/home.html
+++ b/web/app/templates/home.html
@@ -30,7 +30,8 @@
         {% endfor %}
       </select>
 
-      <input type="text" name="section" placeholder="Section">
+      <label for="act">Comma Separated Section List</label>
+      <input type="text" name="section" placeholder="Sections">
 
       <select name="state_code">
         {% for code, name in states %}
@@ -56,7 +57,6 @@
           <th scope="col">Job Name</th>
           <th scope="col">Job Status</th>
           <th scope="col">Output</th>
-          <th scope="col">Log</th>
         </tr>
       </thead>
       <tbody>
@@ -65,7 +65,6 @@
           <td>{{ job.args[0] }}</td>
          <td>{{ job._status }}</td>
           <td><a href="{{ url_for('main.download_output', filename=job.args[0]) }}">Download</a></td>
-          <td>Running</td>
         </tr>
       {% endfor %}
       </tbody>
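
For reference, a minimal usage sketch of the API after this change (not part of the commit): the import path, the JobManager() constructor call, and all argument values below are illustrative assumptions; an RQ worker and a local Redis instance must be running for the enqueued job to actually execute.

    from app.job_manager import JobManager   # assumed import path, based on web/app/job_manager.py

    job_manager = JobManager()

    # The 'section' form field is now split on commas into a list, so a single
    # job can cover several sections of the same act.
    sections = '302,304,307'.split(',')                 # hypothetical form input
    job = job_manager.enqueue_scrape(
        'demo - 1747000000000000000',                   # hypothetical job name
        ['Indian Penal Code'],                          # hypothetical act list
        sections,
        '3',                                            # hypothetical state code
    )

    # get_jobs() now returns both started and finished jobs from the RQ
    # registries, which is what home.html iterates over to render the job table.
    for j in job_manager.get_jobs():
        print(j.args[0], j._status)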