| author | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-31 14:30:38 +0530 |
|---|---|---|
| committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-03-31 14:30:57 +0530 |
| commit | 0f188ea1e638e6abddb03d49b9209c703081b2fe (patch) | |
| tree | cfe69bb82158fccf9eb4d5737d0c9c1603c5e1f1 | |
| parent | 97d1df0cd10f9f4adc1991cc8067cc8f1d3978cf (diff) | |
update
| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | flake.nix | 46 |
| -rw-r--r-- | scrape_ecourtindia_v6/.gitignore | 6 |
| -rw-r--r-- | scrape_ecourtindia_v6/modules/scraper.py | 13 |
| -rw-r--r-- | scrape_ecourtindia_v6/modules/scraper_case_status.py | 58 |
| -rw-r--r-- | scrape_ecourtindia_v6/results/scraping_results.csv | 1 |
| -rw-r--r-- | scrape_ecourtindia_v6/scrape_case_status.py | 134 |
| -rw-r--r-- | scrape_ecourtindia_v6/scrape_case_status_states.py | 70 |
| -rw-r--r-- | scrape_ecourtindia_v6/translate_to_english.py | 42 |
| -rw-r--r-- | test/.gitignore | 2 |
| -rw-r--r-- | test/transcribe.py | 14 |
10 files changed, 252 insertions, 134 deletions
```diff
diff --git a/flake.nix b/flake.nix
--- a/flake.nix
+++ b/flake.nix
@@ -2,27 +2,33 @@
   inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
 
   outputs = { self, nixpkgs, ... }: let
-    pkgs = import nixpkgs { system = "x86_64-linux"; config.allowUnfree = true; };
-  in {
-    devShells.x86_64-linux.default = pkgs.mkShell {
-      buildInputs = with pkgs; [
-        (python3.withPackages (p: [
-          p.selenium
-          p.opencv-python
-          p.pytesseract
-          p.beautifulsoup4
-          p.tinydb
-          p.fastapi
-          p.uvicorn
-          p.jinja2
-        ]))
-        pyright
+    system = "x86_64-linux";
+    pkgs = import nixpkgs { inherit system; config.allowUnfree = true; };
+  in {
+    devShells.${system}.default = pkgs.mkShell {
+      buildInputs = with pkgs; [
+        (python3.withPackages (p: [
+          p.selenium
+          p.opencv-python
+          p.pytesseract
+          p.beautifulsoup4
+          p.tinydb
+          p.fastapi
+          p.uvicorn
+          p.jinja2
-        firefox
-        geckodriver
+          # p.pdf2image
+          # p.openai-whisper
+          # p.torch-bin
+        ]))
-        tesseract
-      ];
-    };
+        pyright
+
+        firefox
+        geckodriver
+
+        tesseract
+      ];
     };
+  };
 }
diff --git a/scrape_ecourtindia_v6/.gitignore b/scrape_ecourtindia_v6/.gitignore
index f32422f..1aed0d4 100644
--- a/scrape_ecourtindia_v6/.gitignore
+++ b/scrape_ecourtindia_v6/.gitignore
@@ -1,6 +1,8 @@
-courts.csv
+*.csv
 csv/*
 named_pdf/*
 pdf/*
 html/*
-orders.json
+bak/
+translated/*
+*.json
diff --git a/scrape_ecourtindia_v6/modules/scraper.py b/scrape_ecourtindia_v6/modules/scraper.py
index 4616763..140302e 100644
--- a/scrape_ecourtindia_v6/modules/scraper.py
+++ b/scrape_ecourtindia_v6/modules/scraper.py
@@ -20,8 +20,14 @@ class Scraper:
         sleep(1)
 
     def select(self, i_d, value):
-        sleep(1)
-        element = self.driver.find_element(By.ID, i_d)
+        while True:
+            try:
+                element = self.driver.find_element(By.ID, i_d)
+                break
+            except:
+                sleep(0.2)
+                pass
+
         select = Select(element)
         select.select_by_visible_text(value)
         sleep(1)
@@ -52,6 +58,9 @@ class Scraper:
 
         return complexes
 
+    def establishments_visible(self):
+        return self.driver.find_element(By.ID, 'court_est_code').is_displayed()
+
     def scrape_establishments(self):
         element = self.driver.find_element(By.ID, 'court_est_code')
         options = Select(element).options
```
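The new `select` polls `find_element` in a bare `try`/`except` loop. Selenium ships an explicit-wait helper that expresses the same intent with a bounded timeout; a minimal sketch of that alternative (the 10-second timeout is an assumption, not part of this commit):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def select(self, i_d, value):
    # Poll for the element like the loop above, but give up after 10s
    # (assumed timeout) instead of spinning forever if the page never loads.
    element = WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.ID, i_d))
    )
    Select(element).select_by_visible_text(value)
```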
```diff
diff --git a/scrape_ecourtindia_v6/modules/scraper_case_status.py b/scrape_ecourtindia_v6/modules/scraper_case_status.py
index 684d9d7..b4a9ec3 100644
--- a/scrape_ecourtindia_v6/modules/scraper_case_status.py
+++ b/scrape_ecourtindia_v6/modules/scraper_case_status.py
@@ -5,7 +5,6 @@ import uuid
 from urllib import request
 
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support.select import Select
 
 from bs4 import BeautifulSoup
 
@@ -13,45 +12,30 @@ import cv2
 import pytesseract
 import tempfile
 
-from tinydb import TinyDB
-
 from .scraper import Scraper
 
 class ScraperCaseStatus(Scraper):
-    def __init__(self, config):
-        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index')
-
-        self.db = TinyDB('db.json')
-        self.config = config
+    def __init__(self):
+        Scraper.__init__(self, 'https://services.ecourts.gov.in/ecourtindia_v6/?p=casestatus/index', headless=False)
 
-    def select_act(self):
-        self.select('actcode', self.config['act'])
+    def select_act(self, act):
+        self.select('actcode', act)
         sleep(1)
 
         # Disposed only
         self.driver.find_element(By.ID, 'radDAct').click()
 
         self.submit_search()
 
-    def select_court(self):
-        sleep(2)
+    def goto_acts(self):
         while True:
-            self.select('sess_state_code', self.config['state'])
-            self.select('sess_dist_code', self.config['district'])
-            self.select('court_complex_code', self.config['court_complex'])
-
-            sleep(2)
-            modal_is_open = self.driver.find_element(By.CLASS_NAME, 'alert-danger-cust').is_displayed()
-            if modal_is_open:
+            try:
                 self.close_modal()
-                continue
-
-            break
-
-        self.select('court_est_code', self.config['court_establishment'])
+                element = self.driver.find_element(By.ID, 'act-tabMenu')
+                element.click()
+                break
+            except:
+                pass
 
-    def goto_acts(self):
-        element = self.driver.find_element(By.ID, 'act-tabMenu')
-        element.click()
         sleep(1)
 
     def submit_search(self):
@@ -77,8 +61,12 @@ class ScraperCaseStatus(Scraper):
             else:
                 captcha_incomplete = False
 
-    def handle_table(self):
-        table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+    def handle_table(self, db):
+        try:
+            table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
+        except:
+            return
+
         self.rows = BeautifulSoup(str(table_innerhtml), 'html.parser').find_all('td')
         self.views = []
         i = 5
@@ -109,7 +97,7 @@ class ScraperCaseStatus(Scraper):
 
             self.parse_orders_table()
 
-            self.db.insert(self.current_view)
+            db.insert(self.current_view)
             print(f'INSERTED: {self.current_view}')
             self.driver.find_element(By.ID, 'main_back_act').click()
             i += 4
@@ -134,7 +122,7 @@ class ScraperCaseStatus(Scraper):
 
             script = order.find_all('a')[0].get_attribute_list('onclick')[0]
             self.driver.execute_script(script)
 
-            sleep(0.7)
+            sleep(1)
             obj = self.driver.find_element(By.TAG_NAME, 'object')
             pdf_url = str(obj.get_attribute('data'))
@@ -153,4 +141,10 @@ class ScraperCaseStatus(Scraper):
         except:
             print(f'UNABLE TO FETCH PDF: {pdf_url}')
 
-        self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+        sleep(1)
+        while True:
+            try:
+                self.driver.find_element(By.ID, 'modalOders').find_element(By.CLASS_NAME, 'btn-close').click()
+                break
+            except:
+                pass
diff --git a/scrape_ecourtindia_v6/results/scraping_results.csv b/scrape_ecourtindia_v6/results/scraping_results.csv
new file mode 100644
index 0000000..35dff1a
--- /dev/null
+++ b/scrape_ecourtindia_v6/results/scraping_results.csv
@@ -0,0 +1 @@
+State,District,Complex,Establishment,Records
```
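The bare `except:` around the `dispTable` lookup also swallows keyboard interrupts and genuine bugs. If the intent is only "no results table rendered", Selenium raises a specific exception that can be caught instead; a narrower sketch, not the committed code:

```python
from selenium.common.exceptions import NoSuchElementException

try:
    table_innerhtml = self.driver.find_element(By.ID, 'dispTable').get_attribute('innerHTML')
except NoSuchElementException:
    # No results table on this page; skip instead of crashing.
    return
```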
```diff
diff --git a/scrape_ecourtindia_v6/scrape_case_status.py b/scrape_ecourtindia_v6/scrape_case_status.py
index 2b543ba..a8891fd 100644
--- a/scrape_ecourtindia_v6/scrape_case_status.py
+++ b/scrape_ecourtindia_v6/scrape_case_status.py
@@ -1,89 +1,67 @@
-import csv
+from time import sleep
 
 from modules.scraper_case_status import ScraperCaseStatus
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
+from tinydb import TinyDB
 
-SCRAPE_ESTABLISHMENTS = True
+db = TinyDB('db.json')
 
-class ThreadSafeCSVWriter:
-    def __init__(self, filename):
-        self.file = open(filename, 'w', newline='')
-        self.writer = csv.writer(self.file)
-        self.lock = threading.Lock()
+scraper = ScraperCaseStatus()
 
-    def writerow(self, row):
-        with self.lock:
-            self.writer.writerow(row)
+state = 'Karnataka'
+act = 'Juvenile Justice (Care and Protection of Children) Act, 2015'
 
-    def close(self):
-        self.file.close()
+scraper.close_modal()
+scraper.select('sess_state_code', state)
+sleep(1)
 
-def scrape_state_thread(state, config, csv_writer):
-    scraper = ScraperCaseStatus(config)
-    scraper.close_modal()
-    try:
-        scraper.select('sess_state_code', state)
-        for district in scraper.scrape_districts():
+for district in scraper.scrape_districts():
+    print(f'SELECTING DISTRICT {district}')
+    while True:
+        try:
+            scraper.close_modal()
             scraper.select('sess_dist_code', district)
-            for cmplx in scraper.scrape_complexes():
+            break
+        except:
+            pass
+    sleep(1)
+
+    for cmplx in scraper.scrape_complexes():
+        sleep(1)
+        print(f'SELECTING COMPLEX {cmplx}')
+        while True:
+            try:
+                scraper.close_modal()
                 scraper.select('court_complex_code', cmplx)
-                if SCRAPE_ESTABLISHMENTS:
-                    establishments = []
-                    for establishment in scraper.scrape_establishments():
-                        establishments.append(establishment)
-
-                    csv_writer.writerow([ state, district, cmplx ] + establishments)
-                else:
-                    csv_writer.writerow([ state, district, cmplx ])
-    except Exception as e:
-        print(f"Error scraping {state}: {e}")
-    finally:
-        scraper.driver.quit()
-
-def scrape_courts():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
-    csv_writer.writerow(['State', 'District', 'Complex'])
-
-    states = m.scrape_states()
-    m.driver.close()
-
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        futures = [
-            executor.submit(scrape_state_thread, state, config, csv_writer)
-            for state in states
-        ]
-
-        for future in as_completed(futures):
+                break
+            except:
+                pass
+        try:
+            scraper.driver.switch_to.alert.accept();
+            scraper.close_modal()
+        except:
+            pass
+
+        for establishment in scraper.scrape_establishments():
+            sleep(1)
+            print(f'SELECTING ESTABLISHMENT {establishment}')
+            while True:
+                try:
+                    scraper.close_modal()
+                    scraper.select('court_est_code', establishment)
+                    break
+                except Exception as e:
+                    print("EXCEPTION HANDLED:")
+                    print(e)
+
+            sleep(1)
+            scraper.close_modal()
+
+            sleep(1)
+            scraper.goto_acts()
             try:
-                future.result()
+                scraper.select_act(act)
+                scraper.handle_table(db)
             except Exception as e:
-                print(f"A thread encountered an error: {e}")
-
-    csv_writer.close()
-
-def scrape_orders():
-    config = {}
-
-    m = ScraperCaseStatus(config)
-    m.close_modal()
-
-    config['state'] = input('Select a state: ')
-    config['district'] = input('Select a district: ')
-    config['court_complex'] = input('Select a court complex: ')
-    config['court_establishment'] = input('Select a court establishment: ')
-    config['act'] = input('Select an act: ')
-
-    m.select_court()
-    m.goto_acts()
-    m.select_act()
-    m.handle_table()
-
-    m.driver.close()
+                print("EXCEPTION HANDLED:")
+                print(e)
 
-if __name__ == '__main__':
-    scrape_courts()
+scraper.driver.close()
```
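With this rewrite, `handle_table(db)` inserts each scraped case into the shared `db.json` TinyDB handle instead of a scraper-owned one. Reading the results back is a one-liner; a small sketch using the same path the script uses:

```python
from tinydb import TinyDB

db = TinyDB('db.json')
print(len(db.all()))      # number of case records inserted so far
for record in db.all():
    print(record)
```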
```diff
diff --git a/scrape_ecourtindia_v6/scrape_case_status_states.py b/scrape_ecourtindia_v6/scrape_case_status_states.py
new file mode 100644
index 0000000..e75af84
--- /dev/null
+++ b/scrape_ecourtindia_v6/scrape_case_status_states.py
@@ -0,0 +1,70 @@
+import csv
+from modules.scraper_case_status import ScraperCaseStatus
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+SCRAPE_ESTABLISHMENTS = True
+
+class ThreadSafeCSVWriter:
+    def __init__(self, filename):
+        self.file = open(filename, 'w', newline='')
+        self.writer = csv.writer(self.file)
+        self.lock = threading.Lock()
+
+    def writerow(self, row):
+        with self.lock:
+            self.writer.writerow(row)
+
+    def close(self):
+        self.file.close()
+
+def scrape_state_thread(state, config, csv_writer):
+    scraper = ScraperCaseStatus(config)
+    scraper.close_modal()
+    try:
+        scraper.select('sess_state_code', state)
+        for district in scraper.scrape_districts():
+            scraper.select('sess_dist_code', district)
+            for cmplx in scraper.scrape_complexes():
+                scraper.select('court_complex_code', cmplx)
+                if SCRAPE_ESTABLISHMENTS:
+                    establishments = []
+                    for establishment in scraper.scrape_establishments():
+                        establishments.append(establishment)
+
+                    csv_writer.writerow([ state, district, cmplx ] + establishments)
+                else:
+                    csv_writer.writerow([ state, district, cmplx ])
+    except Exception as e:
+        print(f"Error scraping {state}: {e}")
+    finally:
+        scraper.driver.quit()
+
+def scrape_courts():
+    config = {}
+
+    m = ScraperCaseStatus(config)
+    m.close_modal()
+
+    csv_writer = ThreadSafeCSVWriter('csv/courts.csv')
+    csv_writer.writerow(['State', 'District', 'Complex'])
+
+    states = m.scrape_states()
+    m.driver.close()
+
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = [
+            executor.submit(scrape_state_thread, state, config, csv_writer)
+            for state in states
+        ]
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"A thread encountered an error: {e}")
+
+    csv_writer.close()
+
+if __name__ == '__main__':
+    scrape_courts()
diff --git a/scrape_ecourtindia_v6/translate_to_english.py b/scrape_ecourtindia_v6/translate_to_english.py
new file mode 100644
index 0000000..485a4b8
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate_to_english.py
@@ -0,0 +1,42 @@
+from tempfile import TemporaryDirectory
+
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+from tinydb import TinyDB
+
+language = 'hin'
+
+def to_english(input_file, output_file):
+    image_file_list = []
+
+    with TemporaryDirectory() as tempdir:
+        pdf_pages = convert_from_path(input_file, 500)
+
+        for page_enumeration, page in enumerate(pdf_pages, start=1):
+            filename = f"{tempdir}/page_{page_enumeration}.jpg"
+            page.save(filename, "JPEG")
+            image_file_list.append(filename)
+
+        with open(output_file, "a") as h:
+            for image_file in image_file_list:
+                text = str(((pytesseract.image_to_string(Image.open(image_file), lang=language))))
+
+                # In many PDFs, at line ending, if a word can't
+                # be written fully, a 'hyphen' is added.
+                # The rest of the word is written in the next line
+                # Eg: This is a sample text this word here GeeksF-
+                # orGeeks is half on first line, remaining on next.
+                # To remove this, we replace every '-\n' to ''.
+                text = text.replace("-\n", "")
+
+                breakpoint()
+
+                h.write(text)
+
+db = TinyDB('orders.json')
+entries = db.all()
+
+for entry in entries:
+    to_english(entry['filename'], f'translated/{entry["filename"][4:-4]}.txt')
```
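`to_english` leans on two system tools the Python packages only wrap: `pdf2image.convert_from_path` shells out to Poppler, and `pytesseract` with `lang='hin'` needs Tesseract's Hindi traineddata (the flake provides `tesseract`, but `p.pdf2image` is still commented out). A quick sanity check before running it, under those environment assumptions:

```python
import pytesseract

# Hindi OCR only works if 'hin' appears here; this calls
# `tesseract --list-langs` under the hood. Poppler's pdftoppm must also
# be on PATH for pdf2image to convert pages.
print(pytesseract.get_languages(config=''))
```

Note also the stray `breakpoint()` in the OCR loop: it drops into the debugger on every page, so the script cannot run unattended until it is removed.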
```diff
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..818a333
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.mp3
diff --git a/test/transcribe.py b/test/transcribe.py
new file mode 100644
index 0000000..c64f425
--- /dev/null
+++ b/test/transcribe.py
@@ -0,0 +1,14 @@
+import os
+import whisper
+
+def transcribe_audio(audio_file_path, model_path):
+    model = whisper.load_model(model_path)
+    result = model.transcribe(audio_file_path)
+    text_file_path = os.path.splitext(audio_file_path)[0] + ".txt"
+    with open(text_file_path, "w") as text_file:
+        text_file.write(result['text'])
+
+audio_file_path = 'test.mp3'
+
+if audio_file_path is not None:
+    transcribe_audio(audio_file_path, model_path='medium')
```
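In `transcribe_audio`, the `model_path` argument is really a model *name*: `whisper.load_model('medium')` resolves named checkpoints and downloads the weights on first use, and decoding `test.mp3` requires `ffmpeg` on `PATH` (neither is pinned by the flake yet; `p.openai-whisper` and `p.torch-bin` are still commented out). A minimal usage sketch under those assumptions:

```python
import whisper

# 'medium' is a registered model name; the first call downloads the
# weights. Pass a smaller name like 'base' for a quick smoke test.
model = whisper.load_model("medium")
result = model.transcribe("test.mp3")
print(result["text"])
```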
