about summary refs log tree commit diff
path: root/scrape_ecourtindia_v6/search_for_words.py
diff options
context:
space:
mode:
Diffstat (limited to 'scrape_ecourtindia_v6/search_for_words.py')
-rw-r--r-- scrape_ecourtindia_v6/search_for_words.py | 109
1 files changed, 109 insertions, 0 deletions
diff --git a/scrape_ecourtindia_v6/search_for_words.py b/scrape_ecourtindia_v6/search_for_words.py
new file mode 100644
index 0000000..effcea9
--- /dev/null
+++ b/scrape_ecourtindia_v6/search_for_words.py
@@ -0,0 +1,109 @@
+import os
+import csv
+import re
+import argostranslate.translate
+
+# Build the Hindi -> English translator from the Argos Translate languages
+# that are already installed (nothing is downloaded here). next() raises
+# StopIteration if either the "hi" or the "en" language is missing.
+installed_languages = argostranslate.translate.load_installed_languages()
+hi_lang = next(lang for lang in installed_languages if lang.code == "hi")
+en_lang = next(lang for lang in installed_languages if lang.code == "en")
+translator = hi_lang.get_translation(en_lang)
+
+# Hindi search terms; every entry gets a "<phrase> Present" column in the CSV
+phrases = [
+    "किशोर",
+    "किशोर न्यायालय",
+    "बोर्ड",
+    "प्रारंभिक आकलन",
+    "प्रारंभिक निर्धारण",
+    "बालक",
+]
+
+# Phrases that are reported with ten snippet columns instead of a single one
+main_phrases = ["किशोर", "किशोर न्यायालय"]
+
+# Directory that is scanned for .txt files
+input_dir = "txt"
+# Output files: the Hindi original and its English translation
+output_csv_hindi = "output_hindi.csv"
+output_csv_english = "output_english.csv"
+# Prefix joined with each file name to form the "File URL" column
+base_url = "https://aarch.compromyse.xyz:8000/txt/"
+
+# Extract up to max_count context snippets for a phrase
+def extract_snippets(text, phrase, window=10, max_count=10):
+    """Return up to *max_count* context snippets around *phrase* in *text*.
+
+    The text is split on whitespace. A match starts at word ``i`` when the
+    phrase occurs as a substring of the run of consecutive words that starts
+    there and is as long as the phrase (one word for a single-word phrase).
+    Phrases made of several words are therefore matched across adjacent
+    words rather than being compared with one word at a time, which could
+    never succeed.
+
+    Args:
+        text: Text to search.
+        phrase: Word or whitespace-separated phrase to look for.
+        window: Number of words of context kept on each side of the match.
+        max_count: Maximum number of snippets to return; values <= 0 yield
+            an empty list.
+
+    Returns:
+        A list of snippets in order of appearance, each one the matched
+        words plus the surrounding context joined by single spaces.
+    """
+    if max_count <= 0:
+        return []
+
+    words = text.split()
+    phrase_words = phrase.split()
+    # Normalise the phrase the same way the text is tokenised, so any run of
+    # whitespace inside the phrase matches the single spaces used below.
+    needle = ' '.join(phrase_words) if phrase_words else phrase
+    span = max(1, len(phrase_words))
+
+    snippets = []
+    for i, word in enumerate(words):
+        candidate = word if span == 1 else ' '.join(words[i:i + span])
+        if needle not in candidate:
+            continue
+        start = max(0, i - window)
+        # Keep `window` words after the last word of the matched phrase.
+        end = min(len(words), i + span + window)
+        snippets.append(' '.join(words[start:end]))
+        if len(snippets) >= max_count:
+            break
+    return snippets
+
+# Build the CSV header row: two file columns, then for every phrase a
+# "Present" flag followed by its snippet column(s).
+header = ["File", "File URL"]
+for phrase in phrases:
+    header.append(f"{phrase} Present")
+    if phrase not in main_phrases:
+        # Ordinary phrases get a single snippet column.
+        header.append(f"{phrase} Snippet")
+        continue
+    # Main phrases get ten numbered snippet columns.
+    header.extend(f"{phrase} Snippet {n}" for n in range(1, 11))
+
+# Process files: build one CSV row per .txt file found in input_dir
+results = []
+# os.listdir() returns entries in arbitrary order; sorting makes the row
+# order of the generated CSVs reproducible from run to run.
+for filename in sorted(os.listdir(input_dir)):
+    if not filename.endswith(".txt"):
+        continue
+
+    filepath = os.path.join(input_dir, filename)
+    with open(filepath, 'r', encoding='utf-8') as f:
+        text = f.read()
+
+    file_url = base_url + filename
+    row = [filename, file_url]
+
+    for phrase in phrases:
+        found = phrase in text
+        row.append("Yes" if found else "No")
+
+        # Only search for snippets when the phrase occurs in the file at all.
+        snippets = extract_snippets(text, phrase, max_count=10) if found else []
+        if phrase in main_phrases:
+            # Main phrases always fill exactly ten snippet columns, padded
+            # with empty strings so every row lines up with the header.
+            row.extend(snippets + [""] * (10 - len(snippets)))
+        else:
+            # Other phrases report only their first snippet (or nothing).
+            row.append(snippets[0] if snippets else "")
+
+    results.append(row)
+
+# Write the Hindi CSV: header row first, then one row per processed file.
+# 'utf-8-sig' puts a BOM at the start of the file; newline='' leaves the
+# line endings to the csv module.
+with open(output_csv_hindi, 'w', encoding='utf-8-sig', newline='') as hindi_file:
+    hindi_writer = csv.writer(hindi_file)
+    hindi_writer.writerows([header, *results])
+
+# Translate header and rows from Hindi to English
+# A cell is sent to the translator only when it contains at least one
+# character in the range U+0900-U+097F; all other cells are copied as-is.
+hindi_char_pattern = re.compile(r'[\u0900-\u097F]')
+# Cells whose translation raised; reported once after the loop.
+untranslated_cells = []
+
+
+def translate_cell(cell):
+    """Return *cell* translated to English, or unchanged if that fails.
+
+    Cells without any character in U+0900-U+097F are returned untouched.
+    Translation is best-effort: if the translator raises, the original text
+    is kept and the cell is recorded in ``untranslated_cells``.
+    """
+    if not hindi_char_pattern.search(cell):
+        return cell
+    try:
+        return translator.translate(cell)
+    except Exception:
+        # Not a bare `except:` so Ctrl-C / SystemExit still stop the script.
+        untranslated_cells.append(cell)
+        return cell
+
+
+# The header uses the same best-effort path as the data rows, so one failing
+# header cell no longer aborts the whole run.
+translated_header = [translate_cell(cell) for cell in header]
+
+translated_rows = [translated_header]
+for row in results:
+    translated_rows.append([translate_cell(cell) for cell in row])
+
+if untranslated_cells:
+    print(f"Warning: {len(untranslated_cells)} cell(s) could not be translated and were left untranslated")
+
+# Write the English CSV; translated_rows already starts with the header row.
+with open(output_csv_english, 'w', encoding='utf-8-sig', newline='') as english_file:
+    csv.writer(english_file).writerows(translated_rows)
+
+# Report where each CSV was written
+for label, saved_path in (("Hindi", output_csv_hindi), ("English", output_csv_english)):
+    print(f"✅ {label} CSV saved to: {saved_path}")