 -rw-r--r--  flake.nix                                             |  46
 -rw-r--r--  scrape_ecourtindia_v6/.gitignore                      |   6
 -rw-r--r--  scrape_ecourtindia_v6/modules/scraper.py              |  13
 -rw-r--r--  scrape_ecourtindia_v6/modules/scraper_case_status.py  |  58
 -rw-r--r--  scrape_ecourtindia_v6/results/scraping_results.csv    |   1
 -rw-r--r--  scrape_ecourtindia_v6/scrape_case_status.py           | 134
 -rw-r--r--  scrape_ecourtindia_v6/scrape_case_status_states.py    |  70
 -rw-r--r--  scrape_ecourtindia_v6/translate_to_english.py         |  42
 -rw-r--r--  test/.gitignore                                       |   2
 -rw-r--r--  test/transcribe.py                                    |  14
 10 files changed, 252 insertions(+), 134 deletions(-)
diff --git a/flake.nix b/flake.nix
index 807fa45..93bca92 100644
--- a/flake.nix
+++ b/flake.nix
@@ -2,27 +2,33 @@
   inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
 
   outputs = { self, nixpkgs, ... }: let
-    pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
-  in {
-    devShells.x86_64-linux.default = pkgs.mkShell {
-      buildInputs = with pkgs; [
-        (python3.withPackages (p: [
-          p.selenium
-          p.opencv-python
-          p.pytesseract
-          p.beautifulsoup4
-          p.tinydb
-          p.fastapi
-          p.uvicorn
-          p.jinja2
-        ]))
-        pyright
+    system = "x86_64-linux";
+    pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
+  in {
+    devShells.${system}.default = pkgs.mkShell {
+      buildInputs = with pkgs; [
+        (python3.withPackages (p: [
+          p.selenium
+          p.opencv-python
+          p.pytesseract
+          p.beautifulsoup4
+          p.tinydb
+          p.fastapi
+          p.uvicorn
+          p.jinja2
-        firefox
-        geckodriver
+          # p.pdf2image
+          # p.openai-whisper
+          # p.torch-bin
+        ]))
-        tesseract
-      ];
-    };
+        pyright
+
+        firefox
+        geckodriver
+
+        tesseract
+      ];
     };
+  };
 }
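
The devShell is consumed with the standard flake workflow from the repo root (plain Nix usage, not part of this commit):

    nix develop

Note that p.pdf2image, p.openai-whisper and p.torch-bin stay commented out, so the new translate_to_english.py and test/transcribe.py scripts below won't find their imports inside this shell until those lines are enabled.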
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index f32422f..1aed0d4 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
-courts.csv
+*.csv
csv/*
named_pdf/*
pdf/*
html/*
-orders.json
+bak/
+translated/*
+*.json
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
index 4616763..140302e 100644
--- a/scrape_ecourtindia_v6/modules/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@ class Scraper:
         sleep(1)
 
     def select(self, i_d, value):
-        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
+        while True:
+            try:
+                element = self.driver.find_element(By.ID, i_d)
+                break
+            except:
+                sleep(0.2)
+                pass
+
         select = Select(element)
         select.select_by_visible_text(value)
         sleep(1)
@@ -52,6 +58,9 @@ class Scraper:
         return complexes
 
+    def establishments_visible(self):
+        return self.driver.find_element(By.ID, 'court_est_code').is_displayed()
+
     def scrape_establishments(self):
         element = self.driver.find_element(By.ID, 'court_est_code')
         options = Select(element).options
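
select() now busy-polls for the dropdown before driving it. Selenium's explicit waits express the same wait-until-present idea with a bounded timeout; a minimal sketch of that alternative (standard Selenium API, not what this commit uses):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.select import Select

    def select(self, i_d, value):
        # Block until the element exists, at most 30 seconds.
        element = WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located((By.ID, i_d))
        )
        Select(element).select_by_visible_text(value)

The trade-off: WebDriverWait raises TimeoutException when the element never appears, where the while True loop above would spin forever.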
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 684d9d7..b4a9ec3 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@
 import uuid
 
 from urllib import request
 
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.select import Select
 from bs4 import BeautifulSoup
@@ -13,45 +12,30 @@
 import cv2
 import pytesseract
 import tempfile
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperCaseStatus(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
-
-        self.db = TinyDB('db.json')
-        self.config = config
+    def __init__(self):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
 
-    def select_act(self):
-        self.select('actcode', self.config['act'])
+    def select_act(self, act):
+        self.select('actcode', act)
         sleep(1)
 
         # Disposed only
         self.driver.find_element(By.ID, 'radDAct').click()
 
         self.submit_search()
 
-    def select_court(self):
-        sleep(2)
+    def goto_acts(self):
         while True:
-            self.select('sess_state_code', self.config['state'])
-            self.select('sess_dist_code', self.config['district'])
-            self.select('court_complex_code', self.config['court_complex'])
-
-            sleep(2)
-            modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
-            if modal_is_open:
+            try:
                 self.close_modal()
-                continue
-
-            break
-
-        self.select('court_est_code', self.config['court_establishment'])
+                element = self.driver.find_element(By.ID, 'act-tabMenu')
+                element.click()
+                break
+            except:
+                pass
 
-    def goto_acts(self):
-        element = self.driver.find_element(By.ID, 'act-tabMenu')
-        element.click()
         sleep(1)
 
@@ -77,8 +61,12 @@ class ScraperCaseStatus(Scraper):
         else:
             captcha_incomplete = False
 
-    def handle_table(self):
-        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+    def handle_table(self, db):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return
+
         self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
         self.views = []
         i = 5
@@ -109,7 +97,7 @@
             self.parse_orders_table()
 
-            self.db.insert(self.current_view)
+            db.insert(self.current_view)
             print(f'INSERTED: {self.current_view}')
             self.driver.find_element(By.ID, 'main_back_act').click()
             i += 4
@@ -134,7 +122,7 @@
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
-            sleep(0.7)
+            sleep(1)
 
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
@@ -153,4 +141,10 @@
             except:
                 print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-        self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+        sleep(1)
+        while True:
+            try:
+                self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+                break
+            except:
+                pass
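
The same wait-and-retry shape now appears three times (select(), goto_acts(), and the modal close above). A shared helper is one way to keep it bounded; this retry() function is hypothetical (not in the repo), sketched only to show the pattern:

    from time import sleep

    def retry(action, delay=0.2, attempts=150):
        # Re-run `action` until it stops raising (Selenium throws while an
        # element is missing or covered by a modal); give up after ~30s.
        for _ in range(attempts):
            try:
                return action()
            except Exception:
                sleep(delay)
        raise TimeoutError('element never became interactable')

    # usage: retry(lambda: self.driver.find_element(By.ID, 'act-tabMenu').click())

Bounding the attempts means a page that never loads fails with an error instead of hanging the scrape.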
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
new file mode 100644
index 0000000..35dff1a
--- /dev/null
+++ b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -0,0 +1 @@
+State,District,Complex,Establishment,Records
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 2b543ba..a8891fd 100644
--- a/scrape_ecourtindia_v6/scrape_case_status.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,67 @@
-import csv
+from time import sleep
 from modules.scraper_case_status import ScraperCaseStatus
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
+from tinydb import TinyDB
 
-SCRAPE_ESTABLISHMENTS = True
+db = TinyDB('db.json')
 
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
-        self.lock = threading.Lock()
+scraper = ScraperCaseStatus()
 
-    def writerow(self, row):
-        with self.lock:
-            self.writer.writerow(row)
+state = 'Karnataka'
+act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'
 
-    def close(self):
-        self.file.close()
+scraper.close_modal()
+scraper.select('sess_state_code', state)
+sleep(1)
 
-def scrape_state_thread(state, config, csv_writer):
-    scraper = ScraperCaseStatus(config)
-    scraper.close_modal()
-    try:
-        scraper.select('sess_state_code', state)
-        for district in scraper.scrape_districts():
+for district in scraper.scrape_districts():
+    print(f'SELECTING DISTRICT {district}')
+    while True:
+        try:
+            scraper.close_modal()
             scraper.select('sess_dist_code', district)
-            for cmplx in scraper.scrape_complexes():
+            break
+        except:
+            pass
+    sleep(1)
+
+    for cmplx in scraper.scrape_complexes():
+        sleep(1)
+        print(f'SELECTING COMPLEX {cmplx}')
+        while True:
+            try:
+                scraper.close_modal()
                 scraper.select('court_complex_code', cmplx)
-                if SCRAPE_ESTABLISHMENTS:
-                    establishments = []
-                    for establishment in scraper.scrape_establishments():
-                        establishments.append(establishment)
-
-                    csv_writer.writerow([ state, district, cmplx ] + establishments)
-                else:
-                    csv_writer.writerow([ state, district, cmplx ])
-    except Exception as e:
-        print(f"Error scraping {state}: {e}")
-    finally:
-        scraper.driver.quit()
-
-def scrape_courts():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
-    csv_writer.writerow(['State', 'District', 'Complex'])
-
-    states = m.scrape_states()
-    m.driver.close()
-
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        futures = [
-            executor.submit(scrape_state_thread, state, config, csv_writer)
-            for state in states
-        ]
-
-        for future in as_completed(futures):
+                break
+            except:
+                pass
+        try:
+            scraper.driver.switch_to.alert.accept()
+            scraper.close_modal()
+        except:
+            pass
+
+        for establishment in scraper.scrape_establishments():
+            sleep(1)
+            print(f'SELECTING ESTABLISHMENT {establishment}')
+            while True:
+                try:
+                    scraper.close_modal()
+                    scraper.select('court_est_code', establishment)
+                    break
+                except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
+
+            sleep(1)
+            scraper.close_modal()
+
+            sleep(1)
+            scraper.goto_acts()
             try:
-                future.result()
+                scraper.select_act(act)
+                scraper.handle_table(db)
             except Exception as e:
-                print(f"A thread encountered an error: {e}")
-
-    csv_writer.close()
-
-def scrape_orders():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    config['state'] = input('Select a state: ')
-    config['district'] = input('Select a district: ')
-    config['court_complex'] = input('Select a court complex: ')
-    config['court_establishment'] = input('Select a court establishment: ')
-    config['act'] = input('Select an act: ')
-
-    m.select_court()
-    m.goto_acts()
-    m.select_act()
-    m.handle_table()
-
-    m.driver.close()
+                print("EXCEPTION HANDLED:")
+                print(e)
 
-if __name__ == '__main__':
-    scrape_courts()
+scraper.driver.close()
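
handle_table(db) now writes every matched case into db.json. Reading the results back is plain TinyDB; a small sketch (the filename key is an assumption about the record shape current_view gets elsewhere in scraper_case_status.py):

    from tinydb import TinyDB, Query

    db = TinyDB('db.json')
    print(len(db))  # number of case records scraped so far
    Case = Query()
    with_pdfs = db.search(Case.filename.exists())  # records whose order PDF was fetched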
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
new file mode 100644
index 0000000..e75af84
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -0,0 +1,70 @@
+import csv
+from modules.scraper_case_status import ScraperCaseStatus
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+SCRAPE_ESTABLISHMENTS = True
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = ScraperCaseStatus(config)
+    scraper.close_modal()
+    try:
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
+                if SCRAPE_ESTABLISHMENTS:
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = ScraperCaseStatus(config)
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+if __name__ == '__main__':
+    scrape_courts()
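
One wrinkle: both call sites here still construct ScraperCaseStatus(config), but this same commit changed ScraperCaseStatus.__init__ to accept no config argument, so running this file as-is would raise a TypeError. A call matching the new constructor would be:

    scraper = ScraperCaseStatus()  # zero-argument constructor from this commit

with the state/district/complex choices driven through scraper.select(...) calls instead of a config dict, as scrape_case_status.py above now does.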
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
new file mode 100644
index 0000000..485a4b8
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate_to_english.py
@@ -0,0 +1,42 @@
+from tempfile import TemporaryDirectory
+
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+from tinydb import TinyDB
+
+language = 'hin'
+
+def to_english(input_file, output_file):
+    image_file_list = []
+
+    with TemporaryDirectory() as tempdir:
+        pdf_pages = convert_from_path(input_file, 500)
+
+        for page_enumeration, page in enumerate(pdf_pages, start=1):
+            filename = f"{tempdir}/page_{page_enumeration}.jpg"
+            page.save(filename, "JPEG")
+            image_file_list.append(filename)
+
+        with open(output_file, "a") as h:
+            for image_file in image_file_list:
+                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
+
+                # In many PDFs, at line ending, if a word can't
+                # be written fully, a 'hyphen' is added.
+                # The rest of the word is written in the next line.
+                # Eg: This is a sample text this word here GeeksF-
+                # orGeeks is half on first line, remaining on next.
+                # To remove this, we replace every '-\n' with ''.
+                text = text.replace("-\n", "")
+
+                breakpoint()
+
+                h.write(text)
+
+db = TinyDB('orders.json')
+entries = db.all()
+
+for entry in entries:
+    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
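
This pipeline needs two system dependencies: pdf2image shells out to poppler, and pytesseract needs the tesseract binary with the Hindi (hin) traineddata installed (the flake ships tesseract, but p.pdf2image is still commented out there). A preflight check with pytesseract's own language listing:

    import pytesseract
    assert 'hin' in pytesseract.get_languages(config=''), 'hin traineddata missing'

Also note the breakpoint() inside the page loop: as committed, the script stops in the debugger after every OCR'd page, so unattended runs need that line removed.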
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..818a333
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
new file mode 100644
index 0000000..c64f425
--- /dev/null
+++ b/test/transcribe.py
@@ -0,0 +1,14 @@
+import os
+import whisper
+
+def transcribe_audio(audio_file_path, model_path):
+    model = whisper.load_model(model_path)
+    result = model.transcribe(audio_file_path)
+    text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
+    with open(text_file_path, "w") as text_file:
+        text_file.write(result['text'])
+
+audio_file_path = 'test.mp3'
+
+if audio_file_path is not None:
+    transcribe_audio(audio_file_path, model_path='medium')
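
whisper.load_model('medium') downloads the checkpoint to ~/.cache/whisper on first use and falls back to CPU without a GPU; on CPU, transcribe() warns that FP16 is unsupported and uses FP32. Passing fp16=False up front silences the warning (a standard whisper option, not used above):

    result = model.transcribe(audio_file_path, fp16=False)

The flake keeps p.openai-whisper and p.torch-bin commented out, so for now this test script has to run outside the dev shell.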