aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRaghuram Subramani <raghus2247@gmail.com>2025-04-14 11:26:33 +0530
committerRaghuram Subramani <raghus2247@gmail.com>2025-04-14 11:26:33 +0530
commitc5d8880d6419e48b5c1450a5c1236576a47d2ac8 (patch)
tree36ed18b85b388d541d25448989c5c77fcb4b2fb2
parent12746a82d6eb50a39e8a86fbfaf11edda1ce29ee (diff)
add translate.py
-rw-r--r--scrape_ecourtindia_v6/translate/translate.py283
1 files changed, 283 insertions, 0 deletions
diff --git a/scrape_ecourtindia_v6/translate/translate.py b/scrape_ecourtindia_v6/translate/translate.py
new file mode 100644
index 0000000..3c6b53a
--- /dev/null
+++ b/scrape_ecourtindia_v6/translate/translate.py
@@ -0,0 +1,283 @@
+import re
+import fitz
+
+k2u = [
+ ('\xf1', '\u0970'), # ñ -> ॰
+ ('Q+Z', 'QZ+'), # Q+Z -> QZ+
+ ('sas', 'sa'), # sas -> sa
+ ('aa', 'a'), # aa -> a
+ (')Z', '\u0930\u094d\u0926\u094d\u0927'), # )Z -> र्द्ध
+ ('ZZ', 'Z'), # ZZ -> Z
+ ('\u2018', '"'), # ‘ -> "
+ ('\u2019', '"'), # ’ -> "
+ ('\u201c', u"'"), # “ -> '
+ ('\u201d', u"'"), # ” -> '
+ ('\xe5', '\u0966'), # å -> ०
+ ('\u0192', '\u0967'), # ƒ -> १
+ ('\u201e', '\u0968'), # „ -> २
+ ('\u2026', '\u0969'), # … -> ३
+ ('\u2020', '\u096a'), # † -> ४
+ ('\u2021', '\u096b'), # ‡ -> ५
+ ('\u02c6', '\u096c'), # ˆ -> ६
+ ('\u2030', '\u096d'), # ‰ -> ७
+ ('\u0160', '\u096e'), # Š -> ८
+ ('\u2039', '\u096f'), # ‹ -> ९
+ ('\xb6+', '\u095e\u094d'), # ¶+ -> फ़्
+ ('d+', '\u0958'), # d+ -> क़
+ ('[+k', '\u0959'), # [+k -> ख़
+ ('[+', '\u0959\u094d'), # [+ -> ख़्
+ ('x+', '\u095a'), # x+ -> ग़
+ ('T+', '\u091c\u093c\u094d'), # T+ -> ज़्
+ ('t+', '\u095b'), # t+ -> ज़
+ ('M+', '\u095c'), # M+ -> ड़
+ ('<+', '\u095d'), # <+ -> ढ़
+ ('Q+', '\u095e'), # Q+ -> फ़
+ (';+', '\u095f'), # ;+ -> य़
+ ('j+', '\u0931'), # j+ -> ऱ
+ ('u+', '\u0929'), # u+ -> ऩ
+ ('\xd9k', '\u0924\u094d\u0924'), # Ùk -> त्त
+ ('\xd9', '\u0924\u094d\u0924\u094d'), # Ù -> त्त्
+ ('\xe4', '\u0915\u094d\u0924'), # ä -> क्त
+ ('\u2013', '\u0926\u0943'), # – -> दृ
+ ('\u2014', '\u0915\u0943'), # — -> कृ
+ ('\xe9', '\u0928\u094d\u0928'), # é -> न्न
+ ('\u2122', '\u0928\u094d\u0928\u094d'), # ™ -> न्न्
+ ('=kk', '=k'), # =kk -> =k
+ ('f=k', 'f='), # f=k -> f=
+ ('\xe0', '\u0939\u094d\u0928'), # à -> ह्न
+ ('\xe1', '\u0939\u094d\u092f'), # á -> ह्य
+ ('\xe2', '\u0939\u0943'), # â -> हृ
+ ('\xe3', '\u0939\u094d\u092e'), # ã -> ह्म
+ ('\xbaz', '\u0939\u094d\u0930'), # ºz -> ह्र
+ ('\xba', '\u0939\u094d'), # º -> ह्
+ ('\xed', '\u0926\u094d\u0926'), # í -> द्द
+ ('{k', '\u0915\u094d\u0937'), # {k -> क्ष
+ ('{', '\u0915\u094d\u0937\u094d'), # { -> क्ष्
+ ('=', '\u0924\u094d\u0930'), # = -> त्र
+ ('\xab', '\u0924\u094d\u0930\u094d'), # « -> त्र्
+ ('N\xee', '\u091b\u094d\u092f'), # Nî -> छ्य
+ ('V\xee', '\u091f\u094d\u092f'), # Vî -> ट्य
+ ('B\xee', '\u0920\u094d\u092f'), # Bî -> ठ्य
+ ('M\xee', '\u0921\u094d\u092f'), # Mî -> ड्य
+ ('<\xee', '\u0922\u094d\u092f'), # <î -> ढ्य
+ ('|', '\u0926\u094d\u092f'), # | -> द्य
+ ('K', '\u091c\u094d\u091e'), # K -> ज्ञ
+ ('}', '\u0926\u094d\u0935'), # } -> द्व
+ ('J', '\u0936\u094d\u0930'), # J -> श्र
+ ('V\xaa', '\u091f\u094d\u0930'), # Vª -> ट्र
+ ('M\xaa', '\u0921\u094d\u0930'), # Mª -> ड्र
+ ('<\xaa\xaa', '\u0922\u094d\u0930'), # <ªª -> ढ्र
+ ('N\xaa', '\u091b\u094d\u0930'), # Nª -> छ्र
+ ('\xd8', '\u0915\u094d\u0930'), # Ø -> क्र
+ ('\xdd', '\u092b\u094d\u0930'), # Ý -> फ्र
+ ('nzZ', '\u0930\u094d\u0926\u094d\u0930'), # nzZ -> र्द्र
+ ('\xe6', '\u0926\u094d\u0930'), # æ -> द्र
+ ('\xe7', '\u092a\u094d\u0930'), # ç -> प्र
+ ('\xc1', '\u092a\u094d\u0930'), # Á -> प्र
+ ('xz', '\u0917\u094d\u0930'), # xz -> ग्र
+ ('#', '\u0930\u0941'), # # -> रु
+ (':', '\u0930\u0942'), # : -> रू
+ ('v\u201a', '\u0911'), # v‚ -> ऑ
+ ('vks', '\u0913'), # vks -> ओ
+ ('vkS', '\u0914'), # vkS -> औ
+ ('vk', '\u0906'), # vk -> आ
+ ('v', '\u0905'), # v -> अ
+ ('b\xb1', '\u0908\u0902'), # b± -> ईं
+ ('\xc3', '\u0908'), # Ã -> ई
+ ('bZ', '\u0908'), # bZ -> ई
+ ('b', '\u0907'), # b -> इ
+ ('m', '\u0909'), # m -> उ
+ ('\xc5', '\u090a'), # Å -> ऊ
+ (',s', '\u0910'), # ,s -> ऐ
+ (',', '\u090f'), # , -> ए
+ ('_', '\u090b'), # _ -> ऋ
+ ('\xf4', '\u0915\u094d\u0915'), # ô -> क्क
+ ('d', '\u0915'), # d -> क
+ ('Dk', '\u0915'), # Dk -> क
+ ('D', '\u0915\u094d'), # D -> क्
+ ('[k', '\u0916'), # [k -> ख
+ ('[', '\u0916\u094d'), # [ -> ख्
+ ('x', '\u0917'), # x -> ग
+ ('Xk', '\u0917'), # Xk -> ग
+ ('X', '\u0917\u094d'), # X -> ग्
+ ('\xc4', '\u0918'), # Ä -> घ
+ ('?k', '\u0918'), # ?k -> घ
+ ('?', '\u0918\u094d'), # ? -> घ्
+ ('\xb3', '\u0919'), # ³ -> ङ
+ ('pkS', '\u091a\u0948'), # pkS -> चै
+ ('p', '\u091a'), # p -> च
+ ('Pk', '\u091a'), # Pk -> च
+ ('P', '\u091a\u094d'), # P -> च्
+ ('N', '\u091b'), # N -> छ
+ ('t', '\u091c'), # t -> ज
+ ('Tk', '\u091c'), # Tk -> ज
+ ('T', '\u091c\u094d'), # T -> ज्
+ ('>', '\u091d'), # > -> झ
+ ('\xf7', '\u091d\u094d'), # ÷ -> झ्
+ ('\xa5', '\u091e'), # ¥ -> ञ
+ ('\xea', '\u091f\u094d\u091f'), # ê -> ट्ट
+ ('\xeb', '\u091f\u094d\u0920'), # ë -> ट्ठ
+ ('V', '\u091f'), # V -> ट
+ ('B', '\u0920'), # B -> ठ
+ ('\xec', '\u0921\u094d\u0921'), # ì -> ड्ड
+ ('\xef', '\u0921\u094d\u0922'), # ï -> ड्ढ
+ ('M+', '\u0921\u093c'), # M+ -> ड़
+ ('<+', '\u0922\u093c'), # <+ -> ढ़
+ ('M', '\u0921'), # M -> ड
+ ('<', '\u0922'), # < -> ढ
+ ('.k', '\u0923'), # .k -> ण
+ ('.', '\u0923\u094d'), # . -> ण्
+ ('r', '\u0924'), # r -> त
+ ('Rk', '\u0924'), # Rk -> त
+ ('R', '\u0924\u094d'), # R -> त्
+ ('Fk', '\u0925'), # Fk -> थ
+ ('F', '\u0925\u094d'), # F -> थ्
+ (')', '\u0926\u094d\u0927'), # ) -> द्ध
+ ('n', '\u0926'), # n -> द
+ ('/k', '\u0927'), # /k -> ध
+# ('\xe8k', '\u0927'), # èk -> ध
+ ('/', '\u0927\u094d'), # / -> ध्
+ ('\xcb', '\u0927\u094d'), # Ë -> ध्
+# ('\xe8', '\u0927\u094d'), # è -> ध्
+ ('\xe8', '\u0927'), # è -> ध
+ ('u', '\u0928'), # u -> न
+ ('Uk', '\u0928'), # Uk -> न
+ ('U', '\u0928\u094d'), # U -> न्
+ ('i', '\u092a'), # i -> प
+ ('Ik', '\u092a'), # Ik -> प
+ ('I', '\u092a\u094d'), # I -> प्
+ ('Q', '\u092b'), # Q -> फ
+ ('\xb6', '\u092b\u094d'), # ¶ -> फ्
+ ('c', '\u092c'), # c -> ब
+ ('Ck', '\u092c'), # Ck -> ब
+ ('C', '\u092c\u094d'), # C -> ब्
+ ('Hk', '\u092d'), # Hk -> भ
+ ('H', '\u092d\u094d'), # H -> भ्
+ ('e', '\u092e'), # e -> म
+ ('Ek', '\u092e'), # Ek -> म
+ ('E', '\u092e\u094d'), # E -> म्
+ (';', '\u092f'), # ; -> य
+ ('\xb8', '\u092f\u094d'), # ¸ -> य्
+ ('j', '\u0930'), # j -> र
+ ('y', '\u0932'), # y -> ल
+ ('Yk', '\u0932'), # Yk -> ल
+ ('Y', '\u0932\u094d'), # Y -> ल्
+ ('G', '\u0933'), # G -> ळ
+ ('o', '\u0935'), # o -> व
+ ('Ok', '\u0935'), # Ok -> व
+ ('O', '\u0935\u094d'), # O -> व्
+ (u"'k", '\u0936'), # 'k -> श
+ (u"'", '\u0936\u094d'), # ' -> श्
+ ('"k', '\u0937'), # "k -> ष
+ ('"', '\u0937\u094d'), # " -> ष्
+ ('l', '\u0938'), # l -> स
+ ('Lk', '\u0938'), # Lk -> स
+ ('L', '\u0938\u094d'), # L -> स्
+ ('g', '\u0939'), # g -> ह
+ ('\xc8', '\u0940\u0902'), # È -> ीं
+ ('saz', '\u094d\u0930\u0947\u0902'), # saz -> ्रें
+ ('z', '\u094d\u0930'), # z -> ्र
+ ('\xcc', '\u0926\u094d\u0926'), # Ì -> द्द
+ ('\xcd', '\u091f\u094d\u091f'), # Í -> ट्ट
+ ('\xce', '\u091f\u094d\u0920'), # Î -> ट्ठ
+ ('\xcf', '\u0921\u094d\u0921'), # Ï -> ड्ड
+ ('\xd1', '\u0915\u0943'), # Ñ -> कृ
+ ('\xd2', '\u092d'), # Ò -> भ
+ ('\xd3', '\u094d\u092f'), # Ó -> ्य
+ ('\xd4', '\u0921\u094d\u0922'), # Ô -> ड्ढ
+ ('\xd6', '\u091d\u094d'), # Ö -> झ्
+ ('\xd8', '\u0915\u094d\u0930'), # Ø -> क्र
+ ('\xd9', '\u0924\u094d\u0924\u094d'), # Ù -> त्त्
+ ('\xdck', '\u0936'), # Ük -> श
+ ('\xdc', '\u0936\u094d'), # Ü -> श्
+ ('\u201a', '\u0949'), # ‚ -> ॉ
+ ('kas', '\u094b\u0902'), # kas -> ों
+ ('ks', '\u094b'), # ks -> ो
+ ('kS', '\u094c'), # kS -> ौ
+ ('\xa1k', '\u093e\u0901'), # ¡k -> ाँ'
+ ('ak', 'k\u0902'), # ak -> k + ं
+ ('k', '\u093e'), # k -> ा
+ ('ah', '\u0940\u0902'), # ah -> ीं
+ ('h', '\u0940'), # h -> ी
+ ('aq', '\u0941\u0902'), # aq -> ुं
+ ('q', '\u0941'), # q -> ु
+ ('aw', '\u0942\u0902'), # aw -> ूं
+ ('\xa1w', '\u0942\u0901'), # ¡w -> ूँ
+ ('w', '\u0942'), # w -> ू
+ ('`', '\u0943'), # ` -> ृ
+ ('\u0300', '\u0943'), # ̀ -> ृ
+ ('as', '\u0947\u0902'), # as -> ें
+ ('\xb1s', 's\xb1'), # ±s -> s±
+ ('s', '\u0947'), # s -> े
+ ('aS', '\u0948\u0902'), # aS -> ैं
+ ('S', '\u0948'), # S -> ै
+ ('a\xaa', '\u094d\u0930\u0902'), # aª -> ्र + ं
+ ('\xaa', '\u094d\u0930'), # ª -> ्र
+ ('fa', '\u0902f'), # fa -> ं + f
+ ('a', '\u0902'), # a -> ं
+ ('\xa1', '\u0901'), # ¡ -> ँ
+ ('%', ':'), # % -> :
+ ('W', '\u0945'), # W -> ॅ
+ ('\u2022', '\u093d'), # • -> ऽ
+ ('\xb7', '\u093d'), # · -> ऽ
+ ('\u2219', '\u093d'), # ∙ -> ऽ
+ ('\xb7', '\u093d'), # · -> ऽ
+ ('~j', '\u094d\u0930'), # ~j -> ्र
+ ('~', '\u094d'), # ~ -> ्
+ ('\\', '?'), # \ -> ?
+ ('+', '\u093c'), # + -> ़
+ ('^', '\u2018'), # ^ -> ‘
+ ('*', '\u2019'), # * -> ’
+ ('\xde', '\u201c'), # Þ -> “
+ ('\xdf', '\u201d'), # ß -> ”
+ ('(', ';'), # ( -> ;
+ ('\xbc', '('), # ¼ -> (
+ ('\xbd', ')'), # ½ -> )
+ ('\xbf', '{'), # ¿ -> {
+ ('\xc0', '}'), # À -> }
+ ('\xbe', '='), # ¾ -> =
+ ('A', '\u0964'), # A -> ।
+ ('-', '.'), # - -> .
+ ('&', '-'), # & -> -
+ ('&', '\xb5'), # & -> µ
+ ('\u03bc', '-'), # μ -> -
+ ('\u0152', '\u0970'), # Œ -> ॰
+ (']', ','), # ] -> ,
+ ('~ ', '\u094d '), # ~ -> ्
+ ('@', '/'), # @ -> /
+ ('\xae', '\u0948\u0902'), # ® -> ैं
+# ('%', '\u0903'), # % -> ः
+# (' \u0903', ':'), # ः -> :
+# ('\xc7', '\u093f\u0902'), # Ç -> िं
+# ('\xca', '\u0940Z'), # Ê -> ीZ
+# ('Z', '\u0930\u094d'), # Z -> र्
+# ('f', '\u093f'), # f -> ि
+# ('\xb1', 'Z\u0902'), # ± -> Zं
+# ('\xc6', '\u0930\u094d\u093f'), # Æ -> र्ि
+# ('\xc9', '\u0930\u094d\u093f\u0902'), # É -> र्ि'
+]
+
+def convert_text(text):
+ for src, tgt in k2u:
+ text = text.replace(src, tgt)
+ return text
+
+def extract_text_from_pdf(pdf_path):
+ doc = fitz.open(pdf_path)
+ full_text = ""
+ for page in doc:
+ full_text += page.get_text()
+ return full_text
+
+def convert_pdf(pdf_path, output_path):
+ text = extract_text_from_pdf(pdf_path)
+ converted = convert_text(text)
+ with open(output_path, 'w', encoding='utf-8') as f:
+ f.write(converted)
+
+if __name__ == "__main__":
+ import sys
+ if len(sys.argv) != 3:
+ print(f"Usage: python {sys.argv[0]} input.pdf output.txt")
+ else:
+ convert_pdf(sys.argv[1], sys.argv[2])