diff options
author | Raghuram Subramani <raghus2247@gmail.com> | 2025-04-14 11:26:33 +0530 |
---|---|---|
committer | Raghuram Subramani <raghus2247@gmail.com> | 2025-04-14 11:26:33 +0530 |
commit | c5d8880d6419e48b5c1450a5c1236576a47d2ac8 (patch) | |
tree | 36ed18b85b388d541d25448989c5c77fcb4b2fb2 | |
parent | 12746a82d6eb50a39e8a86fbfaf11edda1ce29ee (diff) |
add translate.py
-rw-r--r-- | scrape_ecourtindia_v6/translate/translate.py | 283 |
1 files changed, 283 insertions, 0 deletions
"""Convert legacy-font Hindi text extracted from a PDF into Unicode Devanagari.

File: scrape_ecourtindia_v6/translate/translate.py

The module holds an ordered substitution table (``k2u``), a function that
applies it to a string, and thin helpers that read a PDF with PyMuPDF and
write the converted text to a UTF-8 file.
"""

import re  # kept from the original import list; not referenced by the code below
import sys

# Ordered (source, target) substitution pairs.
#
# ORDER MATTERS: ``convert_text`` applies the pairs one after another to the
# whole string, so a later pair sees the output of every earlier pair.
# Multi-character sources are listed before their single-character prefixes
# (e.g. 'vks' before 'vk' before 'v'), and some pairs deliberately emit ASCII
# that a later pair consumes (e.g. '\u2018' -> '"' and later '"' -> '\u0937\u094d').
# Do not sort, de-duplicate, or convert this into a dict / str.translate map
# without re-validating the output.
#
# NOTE(review): the source glyphs look like a KrutiDev-style legacy Hindi font
# encoding -- confirm against the PDFs this is run on.
# NOTE(review): the rules for bare 'f' and 'Z' are commented out at the end of
# the table and no active rule consumes a bare 'f', so those characters pass
# through to the output unchanged.  The disabled comments suggest they stand
# for the short-i matra and the reph; handling them presumably needs
# reordering logic rather than a plain replacement -- TODO confirm.
k2u = [
    ('\xf1', '\u0970'),  # ñ -> ॰
    ('Q+Z', 'QZ+'),  # Q+Z -> QZ+
    ('sas', 'sa'),  # sas -> sa
    ('aa', 'a'),  # aa -> a
    (')Z', '\u0930\u094d\u0926\u094d\u0927'),  # )Z -> र्द्ध
    ('ZZ', 'Z'),  # ZZ -> Z
    ('\u2018', '"'),  # ‘ -> "
    ('\u2019', '"'),  # ’ -> "
    ('\u201c', "'"),  # “ -> '
    ('\u201d', "'"),  # ” -> '
    ('\xe5', '\u0966'),  # å -> ०
    ('\u0192', '\u0967'),  # ƒ -> १
    ('\u201e', '\u0968'),  # „ -> २
    ('\u2026', '\u0969'),  # … -> ३
    ('\u2020', '\u096a'),  # † -> ४
    ('\u2021', '\u096b'),  # ‡ -> ५
    ('\u02c6', '\u096c'),  # ˆ -> ६
    ('\u2030', '\u096d'),  # ‰ -> ७
    ('\u0160', '\u096e'),  # Š -> ८
    ('\u2039', '\u096f'),  # ‹ -> ९
    ('\xb6+', '\u095e\u094d'),  # ¶+ -> फ़्
    ('d+', '\u0958'),  # d+ -> क़
    ('[+k', '\u0959'),  # [+k -> ख़
    ('[+', '\u0959\u094d'),  # [+ -> ख़्
    ('x+', '\u095a'),  # x+ -> ग़
    ('T+', '\u091c\u093c\u094d'),  # T+ -> ज़्
    ('t+', '\u095b'),  # t+ -> ज़
    ('M+', '\u095c'),  # M+ -> ड़
    ('<+', '\u095d'),  # <+ -> ढ़
    ('Q+', '\u095e'),  # Q+ -> फ़
    (';+', '\u095f'),  # ;+ -> य़
    ('j+', '\u0931'),  # j+ -> ऱ
    ('u+', '\u0929'),  # u+ -> ऩ
    ('\xd9k', '\u0924\u094d\u0924'),  # Ùk -> त्त
    ('\xd9', '\u0924\u094d\u0924\u094d'),  # Ù -> त्त्
    ('\xe4', '\u0915\u094d\u0924'),  # ä -> क्त
    ('\u2013', '\u0926\u0943'),  # – -> दृ
    ('\u2014', '\u0915\u0943'),  # — -> कृ
    ('\xe9', '\u0928\u094d\u0928'),  # é -> न्न
    ('\u2122', '\u0928\u094d\u0928\u094d'),  # ™ -> न्न्
    ('=kk', '=k'),  # =kk -> =k
    ('f=k', 'f='),  # f=k -> f=
    ('\xe0', '\u0939\u094d\u0928'),  # à -> ह्न
    ('\xe1', '\u0939\u094d\u092f'),  # á -> ह्य
    ('\xe2', '\u0939\u0943'),  # â -> हृ
    ('\xe3', '\u0939\u094d\u092e'),  # ã -> ह्म
    ('\xbaz', '\u0939\u094d\u0930'),  # ºz -> ह्र
    ('\xba', '\u0939\u094d'),  # º -> ह्
    ('\xed', '\u0926\u094d\u0926'),  # í -> द्द
    ('{k', '\u0915\u094d\u0937'),  # {k -> क्ष
    ('{', '\u0915\u094d\u0937\u094d'),  # { -> क्ष्
    ('=', '\u0924\u094d\u0930'),  # = -> त्र
    ('\xab', '\u0924\u094d\u0930\u094d'),  # « -> त्र्
    ('N\xee', '\u091b\u094d\u092f'),  # Nî -> छ्य
    ('V\xee', '\u091f\u094d\u092f'),  # Vî -> ट्य
    ('B\xee', '\u0920\u094d\u092f'),  # Bî -> ठ्य
    ('M\xee', '\u0921\u094d\u092f'),  # Mî -> ड्य
    ('<\xee', '\u0922\u094d\u092f'),  # <î -> ढ्य
    ('|', '\u0926\u094d\u092f'),  # | -> द्य
    ('K', '\u091c\u094d\u091e'),  # K -> ज्ञ
    ('}', '\u0926\u094d\u0935'),  # } -> द्व
    ('J', '\u0936\u094d\u0930'),  # J -> श्र
    ('V\xaa', '\u091f\u094d\u0930'),  # Vª -> ट्र
    ('M\xaa', '\u0921\u094d\u0930'),  # Mª -> ड्र
    ('<\xaa\xaa', '\u0922\u094d\u0930'),  # <ªª -> ढ्र
    ('N\xaa', '\u091b\u094d\u0930'),  # Nª -> छ्र
    ('\xd8', '\u0915\u094d\u0930'),  # Ø -> क्र
    ('\xdd', '\u092b\u094d\u0930'),  # Ý -> फ्र
    ('nzZ', '\u0930\u094d\u0926\u094d\u0930'),  # nzZ -> र्द्र
    ('\xe6', '\u0926\u094d\u0930'),  # æ -> द्र
    ('\xe7', '\u092a\u094d\u0930'),  # ç -> प्र
    ('\xc1', '\u092a\u094d\u0930'),  # Á -> प्र
    ('xz', '\u0917\u094d\u0930'),  # xz -> ग्र
    ('#', '\u0930\u0941'),  # # -> रु
    (':', '\u0930\u0942'),  # : -> रू
    ('v\u201a', '\u0911'),  # v‚ -> ऑ
    ('vks', '\u0913'),  # vks -> ओ
    ('vkS', '\u0914'),  # vkS -> औ
    ('vk', '\u0906'),  # vk -> आ
    ('v', '\u0905'),  # v -> अ
    ('b\xb1', '\u0908\u0902'),  # b± -> ईं
    ('\xc3', '\u0908'),  # Ã -> ई
    ('bZ', '\u0908'),  # bZ -> ई
    ('b', '\u0907'),  # b -> इ
    ('m', '\u0909'),  # m -> उ
    ('\xc5', '\u090a'),  # Å -> ऊ
    (',s', '\u0910'),  # ,s -> ऐ
    (',', '\u090f'),  # , -> ए
    ('_', '\u090b'),  # _ -> ऋ
    ('\xf4', '\u0915\u094d\u0915'),  # ô -> क्क
    ('d', '\u0915'),  # d -> क
    ('Dk', '\u0915'),  # Dk -> क
    ('D', '\u0915\u094d'),  # D -> क्
    ('[k', '\u0916'),  # [k -> ख
    ('[', '\u0916\u094d'),  # [ -> ख्
    ('x', '\u0917'),  # x -> ग
    ('Xk', '\u0917'),  # Xk -> ग
    ('X', '\u0917\u094d'),  # X -> ग्
    ('\xc4', '\u0918'),  # Ä -> घ
    ('?k', '\u0918'),  # ?k -> घ
    ('?', '\u0918\u094d'),  # ? -> घ्
    ('\xb3', '\u0919'),  # ³ -> ङ
    ('pkS', '\u091a\u0948'),  # pkS -> चै
    ('p', '\u091a'),  # p -> च
    ('Pk', '\u091a'),  # Pk -> च
    ('P', '\u091a\u094d'),  # P -> च्
    ('N', '\u091b'),  # N -> छ
    ('t', '\u091c'),  # t -> ज
    ('Tk', '\u091c'),  # Tk -> ज
    ('T', '\u091c\u094d'),  # T -> ज्
    ('>', '\u091d'),  # > -> झ
    ('\xf7', '\u091d\u094d'),  # ÷ -> झ्
    ('\xa5', '\u091e'),  # ¥ -> ञ
    ('\xea', '\u091f\u094d\u091f'),  # ê -> ट्ट
    ('\xeb', '\u091f\u094d\u0920'),  # ë -> ट्ठ
    ('V', '\u091f'),  # V -> ट
    ('B', '\u0920'),  # B -> ठ
    ('\xec', '\u0921\u094d\u0921'),  # ì -> ड्ड
    ('\xef', '\u0921\u094d\u0922'),  # ï -> ड्ढ
    ('M+', '\u0921\u093c'),  # M+ -> ड़
    ('<+', '\u0922\u093c'),  # <+ -> ढ़
    ('M', '\u0921'),  # M -> ड
    ('<', '\u0922'),  # < -> ढ
    ('.k', '\u0923'),  # .k -> ण
    ('.', '\u0923\u094d'),  # . -> ण्
    ('r', '\u0924'),  # r -> त
    ('Rk', '\u0924'),  # Rk -> त
    ('R', '\u0924\u094d'),  # R -> त्
    ('Fk', '\u0925'),  # Fk -> थ
    ('F', '\u0925\u094d'),  # F -> थ्
    (')', '\u0926\u094d\u0927'),  # ) -> द्ध
    ('n', '\u0926'),  # n -> द
    ('/k', '\u0927'),  # /k -> ध
    # ('\xe8k', '\u0927'),  # èk -> ध
    ('/', '\u0927\u094d'),  # / -> ध्
    ('\xcb', '\u0927\u094d'),  # Ë -> ध्
    # ('\xe8', '\u0927\u094d'),  # è -> ध्
    ('\xe8', '\u0927'),  # è -> ध
    ('u', '\u0928'),  # u -> न
    ('Uk', '\u0928'),  # Uk -> न
    ('U', '\u0928\u094d'),  # U -> न्
    ('i', '\u092a'),  # i -> प
    ('Ik', '\u092a'),  # Ik -> प
    ('I', '\u092a\u094d'),  # I -> प्
    ('Q', '\u092b'),  # Q -> फ
    ('\xb6', '\u092b\u094d'),  # ¶ -> फ्
    ('c', '\u092c'),  # c -> ब
    ('Ck', '\u092c'),  # Ck -> ब
    ('C', '\u092c\u094d'),  # C -> ब्
    ('Hk', '\u092d'),  # Hk -> भ
    ('H', '\u092d\u094d'),  # H -> भ्
    ('e', '\u092e'),  # e -> म
    ('Ek', '\u092e'),  # Ek -> म
    ('E', '\u092e\u094d'),  # E -> म्
    (';', '\u092f'),  # ; -> य
    ('\xb8', '\u092f\u094d'),  # ¸ -> य्
    ('j', '\u0930'),  # j -> र
    ('y', '\u0932'),  # y -> ल
    ('Yk', '\u0932'),  # Yk -> ल
    ('Y', '\u0932\u094d'),  # Y -> ल्
    ('G', '\u0933'),  # G -> ळ
    ('o', '\u0935'),  # o -> व
    ('Ok', '\u0935'),  # Ok -> व
    ('O', '\u0935\u094d'),  # O -> व्
    ("'k", '\u0936'),  # 'k -> श
    ("'", '\u0936\u094d'),  # ' -> श्
    ('"k', '\u0937'),  # "k -> ष
    ('"', '\u0937\u094d'),  # " -> ष्
    ('l', '\u0938'),  # l -> स
    ('Lk', '\u0938'),  # Lk -> स
    ('L', '\u0938\u094d'),  # L -> स्
    ('g', '\u0939'),  # g -> ह
    ('\xc8', '\u0940\u0902'),  # È -> ीं
    ('saz', '\u094d\u0930\u0947\u0902'),  # saz -> ्रें
    ('z', '\u094d\u0930'),  # z -> ्र
    ('\xcc', '\u0926\u094d\u0926'),  # Ì -> द्द
    ('\xcd', '\u091f\u094d\u091f'),  # Í -> ट्ट
    ('\xce', '\u091f\u094d\u0920'),  # Î -> ट्ठ
    ('\xcf', '\u0921\u094d\u0921'),  # Ï -> ड्ड
    ('\xd1', '\u0915\u0943'),  # Ñ -> कृ
    ('\xd2', '\u092d'),  # Ò -> भ
    ('\xd3', '\u094d\u092f'),  # Ó -> ्य
    ('\xd4', '\u0921\u094d\u0922'),  # Ô -> ड्ढ
    ('\xd6', '\u091d\u094d'),  # Ö -> झ्
    ('\xd8', '\u0915\u094d\u0930'),  # Ø -> क्र
    ('\xd9', '\u0924\u094d\u0924\u094d'),  # Ù -> त्त्
    ('\xdck', '\u0936'),  # Ük -> श
    ('\xdc', '\u0936\u094d'),  # Ü -> श्
    ('\u201a', '\u0949'),  # ‚ -> ॉ
    ('kas', '\u094b\u0902'),  # kas -> ों
    ('ks', '\u094b'),  # ks -> ो
    ('kS', '\u094c'),  # kS -> ौ
    ('\xa1k', '\u093e\u0901'),  # ¡k -> ाँ
    ('ak', 'k\u0902'),  # ak -> k + ं
    ('k', '\u093e'),  # k -> ा
    ('ah', '\u0940\u0902'),  # ah -> ीं
    ('h', '\u0940'),  # h -> ी
    ('aq', '\u0941\u0902'),  # aq -> ुं
    ('q', '\u0941'),  # q -> ु
    ('aw', '\u0942\u0902'),  # aw -> ूं
    ('\xa1w', '\u0942\u0901'),  # ¡w -> ूँ
    ('w', '\u0942'),  # w -> ू
    ('`', '\u0943'),  # ` -> ृ
    ('\u0300', '\u0943'),  # U+0300 (combining grave accent) -> ृ
    ('as', '\u0947\u0902'),  # as -> ें
    ('\xb1s', 's\xb1'),  # ±s -> s±
    ('s', '\u0947'),  # s -> े
    ('aS', '\u0948\u0902'),  # aS -> ैं
    ('S', '\u0948'),  # S -> ै
    ('a\xaa', '\u094d\u0930\u0902'),  # aª -> ्र + ं
    ('\xaa', '\u094d\u0930'),  # ª -> ्र
    ('fa', '\u0902f'),  # fa -> ं + f
    ('a', '\u0902'),  # a -> ं
    ('\xa1', '\u0901'),  # ¡ -> ँ
    ('%', ':'),  # % -> :
    ('W', '\u0945'),  # W -> ॅ
    ('\u2022', '\u093d'),  # • -> ऽ
    ('\xb7', '\u093d'),  # · -> ऽ
    ('\u2219', '\u093d'),  # ∙ -> ऽ
    ('\xb7', '\u093d'),  # · -> ऽ
    ('~j', '\u094d\u0930'),  # ~j -> ्र
    ('~', '\u094d'),  # ~ -> ्
    ('\\', '?'),  # backslash -> ?
    ('+', '\u093c'),  # + -> ़
    ('^', '\u2018'),  # ^ -> ‘
    ('*', '\u2019'),  # * -> ’
    ('\xde', '\u201c'),  # Þ -> “
    ('\xdf', '\u201d'),  # ß -> ”
    ('(', ';'),  # ( -> ;
    ('\xbc', '('),  # ¼ -> (
    ('\xbd', ')'),  # ½ -> )
    ('\xbf', '{'),  # ¿ -> {
    ('\xc0', '}'),  # À -> }
    ('\xbe', '='),  # ¾ -> =
    ('A', '\u0964'),  # A -> ।
    ('-', '.'),  # - -> .
    ('&', '-'),  # & -> -
    ('&', '\xb5'),  # & -> µ
    ('\u03bc', '-'),  # μ -> -
    ('\u0152', '\u0970'),  # Œ -> ॰
    (']', ','),  # ] -> ,
    ('~ ', '\u094d '),  # "~ " -> "् "
    ('@', '/'),  # @ -> /
    ('\xae', '\u0948\u0902'),  # ® -> ैं
    # Disabled rules, kept for reference (see the NOTE above the table):
    # ('%', '\u0903'),  # % -> ः
    # (' \u0903', ':'),  # ः -> :
    # ('\xc7', '\u093f\u0902'),  # Ç -> िं
    # ('\xca', '\u0940Z'),  # Ê -> ीZ
    # ('Z', '\u0930\u094d'),  # Z -> र्
    # ('f', '\u093f'),  # f -> ि
    # ('\xb1', 'Z\u0902'),  # ± -> Zं
    # ('\xc6', '\u0930\u094d\u093f'),  # Æ -> र्ि
    # ('\xc9', '\u0930\u094d\u093f\u0902'),  # É -> र्ि
]


def convert_text(text: str) -> str:
    """Apply every ``k2u`` substitution to *text*, in table order.

    Each pair replaces all occurrences of its source string before the next
    pair runs, so the result depends on the order of ``k2u``.

    Args:
        text: The string to convert.

    Returns:
        The converted string; characters with no rule are left as they are.
    """
    for src, tgt in k2u:
        text = text.replace(src, tgt)
    return text


def extract_text_from_pdf(pdf_path) -> str:
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page joined in page order.
    """
    # Imported here rather than at module top so that ``convert_text`` stays
    # usable (and testable) on machines without PyMuPDF installed.
    import fitz

    # The context manager closes the document even if a page fails to render;
    # the original left the handle open.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def convert_pdf(pdf_path, output_path) -> None:
    """Extract the text of *pdf_path*, convert it, and write it to *output_path*.

    Args:
        pdf_path: Path of the input PDF.
        output_path: Path of the text file to create or overwrite (UTF-8).
    """
    converted = convert_text(extract_text_from_pdf(pdf_path))
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(converted)


def main(argv=None) -> int:
    """Command-line entry point.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 after a successful conversion, 2 when the argument count is wrong
        (the usage text then goes to stderr instead of stdout).
    """
    args = sys.argv if argv is None else list(argv)
    if len(args) != 3:
        prog = args[0] if args else "translate.py"
        print(f"Usage: python {prog} input.pdf output.txt", file=sys.stderr)
        return 2
    convert_pdf(args[1], args[2])
    return 0


if __name__ == "__main__":
    sys.exit(main())