Unlock 30% off on Manual Testing Annual Plans this Holiday Season.

Copied to Clipboard!

How to use is_wiki_word method in Nose

Best Python code snippet using nose

template_odt.py

Source:template_odt.py

...757    def _expand_links(self, matches):758        (source, label, external) = self._process_link(matches)759        if(source.startswith("#")):760            temp = source[1:]761            if(self.m_engine.is_wiki_word(temp)):762                return temp763        # DEBUG BRAD: This is a temporary hack to get links of the format -> to work764        source = re.sub(".*?#(.*)", "\\1", source)765        label = re.sub(".*?#(.*)", "\\1", label)766        #print "SOURCE = %s, LABEL = %s" % (source, label)767        #source.replace("-&amp;gt;", "")768        #label = label.replace("-&gt;", "")769        # Unconvert any XML in case it has already been770        # converted by format_text()771        label = self.unxmlize(label)772        # Now make the label XML safe again773        label = self.xmlize(label)774        if(source[0:4] == "http" or external == True):775            if(source[0:4] != "http"):...

build_union_db.py

Source:build_union_db.py

1#! /usr/bin/python32# -*- coding: utf-8 -*-3#--------------------------------------------------------------------------------------------------4# Script to build a union database by merging TSV dictionaries5#6# Usage:7#   build_union_db.py [--output str] [--core str] [--gross str] [--top str] [--slim str]8#     [--phrase_prob str] [--tran_prob str] [--tran_aux str] [--tran_aux_last str]9#     [--rev_prob str] [--cooc_prob str] [--aoa str] [--keyword str] [--min_prob str]10#     [--quiet] inputs...11#   (An input specified as "label:tsv_file".12#13# Example:14#   ./build_union_db.py --output union-body.tkh \15#     --phrase_prob enwiki-phrase-prob.tkh --tran_prob tran-prob.tkh \16#     --tran_aux dict1.tsv,dict2.tsv --rev_prob jawiki-word-prob.tkh \17#     --cooc_prob enwiki-cooc-prob.tkh --min_prob we:0.00001 \18#     wj:wiktionary-ja.tsv wn:wordnet.tsv we:wiktionary-en.tsv19#20# Copyright 2020 Google LLC21# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file22# except in compliance with the License.  You may obtain a copy of the License at23#     https://www.apache.org/licenses/LICENSE-2.024# Unless required by applicable law or agreed to in writing, software distributed under the25# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,26# either express or implied.  See the License for the specific language governing permissions27# and limitations under the License.28#--------------------------------------------------------------------------------------------------29import collections30import json31import logging32import math33import operator34import os35import regex36import sys37import time38import tkrzw39import tkrzw_dict40import tkrzw_pron_util41import tkrzw_tokenizer42import unicodedata43logger = tkrzw_dict.GetLogger()44poses = ("noun", "verb", "adjective", "adverb",45         "pronoun", "auxverb", "preposition", "determiner", "article",46         "interjection", "conjunction", "prefix", "suffix",47         "abbreviation", "phrase", "misc")48inflection_names = ("noun_plural","verb_singular", "verb_present_participle",49                   "verb_past", "verb_past_participle",50                   "adjective_comparative", "adjective_superlative",51                   "adverb_comparative", "adverb_superlative")52etymology_names = ("etymology_prefix", "etymology_core", "etymology_suffix")53top_names = ("pronunciation",) + inflection_names + etymology_names54rel_weights = {"synonym": 1.0,55               "hypernym": 0.9,56               "hyponym": 0.8,57               "antonym": 0.2,58               "derivative": 0.7,59               "relation": 0.5}60noun_suffixes = [61  "es", "s", "ment", "age", "ics", "ness", "ity", "ism", "or", "er", "ist", "t", "pt", "th",62  "ian", "ee", "tion", "sion", "ty", "ance", "ence", "ency", "cy", "ry", "ary", "ery", "ory",63  "al", "age", "dom", "hood", "ship", "nomy", "ing", "ication", "icator", "ce", "se", "son",64  "iation", "ant", "faction", "ture", "sure", "nance", "y", "ess",65]66verb_suffixes = [67  "ify", "en", "ize", "ise", "fy", "ate", "age", "e",68]69adjective_suffixes = [70  "some", "able", "ible", "ic", "ical", "ial", "ive", "ful", "less", "ly", "ous", "y",71  "tic", "ine", "ised", "ing", "ed", "ish", "al", "ual", "icable", "er", "est", "ent", "ific",72  "ative", "tative", "ant", "ary",73]74adverb_suffixes = [75  "ly",76]77particles = {78  "aback", "about", "above", "abroad", "across", "after", "against", "ahead", "along",79  "amid", "among", "apart", "around", "as", "at", "away", "back", "before", "behind",80  "below", "beneath", "between", "beside", "beyond", "by", "despite", "during", "down",81  "except", "for", "forth", "from", "in", "inside", "into", "near", "of", "off", "on",82  "onto", "out", "outside", "over", "per", "re", "since", "than", "through", "throughout",83  "till", "to", "together", "toward", "under", "until", "up", "upon", "with", "within",84  "without", "via",85}86misc_stop_words = {87  "the", "a", "an", "I", "my", "me", "mine", "you", "your", "yours", "he", "his", "him",88  "she", "her", "hers", "it", "its", "they", "their", "them", "theirs",89  "we", "our", "us", "ours", "some", "any", "one", "someone", "something",90  "myself", "yourself", "yourselves", "himself", "herself", "itself", "themselves",91  "who", "whom", "whose", "what", "where", "when", "why", "how", "and", "but", "not", "no",92  "never", "ever", "time", "place", "people", "person", "this", "these", "that", "those",93  "other", "another", "yes", "thou",94  "back", "much", "many", "more", "most", "good", "well", "better", "best", "all",95}96wiki_stop_words = {97  "wikipedia", "encyclopedia", "page", "pages", "edit", "edits", "comment", "comments",98}99no_parents = {100  "number", "ground", "red", "happen", "letter", "monitor", "feed", "winter", "brake",101  "partner", "sister", "environment", "moment", "gun", "shower", "trigger", "wound", "bound",102  "weed", "saw", "copper", "buffer", "lump", "wary", "stove", "doctor", "hinder",103  "tower", "poetry", "parity", "fell", "lay", "bit", "drug", "grass", "shore", "notice",104  "butter", "slang", "grope", "feces", "left", "former", "found", "every", "scheme",105  "evening", "architecture", "hat", "slice", "bite", "tender", "bully", "translate",106  "fence", "liver", "special", "specific", "species", "statistics", "mathematics", "caution",107  "span", "fleet", "language", "gripe", "dribble", "total", "error", "option", "important",108  "shine", "dental", "irony", "transplant", "chemistry", "physics", "grocery", "grade",109  "gutter", "dove", "weary", "queer", "shove", "buggy", "twine", "tier", "rung", "spat",110  "pang", "jibe", "pent", "lode", "gelt", "plant", "plane", "pants", "craze", "grove",111  "downy", "musty", "mangy", "moped", "caper", "balmy", "tinny", "induce", "treaty",112  "chili", "chilli", "chile", "castor", "landry", "start", "baby", "means", "transfer",113  "interior", "exterior", "rabbit", "stripe", "fairy", "shunt", "clove", "abode", "bends",114  "molt", "holler", "feudal", "bounce", "livery", "wan", "sod", "dug", "het", "gat",115  "cover", "book", "cause", "quality", "process", "provide", "entry", "specify", "morning",116  "guarantee", "listen", "identity", "clone", "impress", "belly", "mansion",117}118force_parents = {119  "upwards": "upward", "towards": "toward", "identify": "identity", "guaranty": "guarantee",120  "advice": "advise", "device": "devise", "practice": "practise", "morn": "morning",121  "approximately": "approximate", "invocation": "invoke", "spec": "specify",122  "prisoner": "prison", "emission": "emit", "omission": "omit", "transmission": "transmit",123  "fission": "fissure", "competitive": "compete", "competitor": "compete",124  "conservative": "conserve", "pronunciation": "pronounce", "revelation": "reveal",125  "possession": "possess", "schema": "scheme", "further": "far", "farther": "far",126  "conjunction": "conjunct", "conjunctive": "conjunct", "location": "locate",127  "conjugation": "conjugate", "conjugative": "conjugate", "installation": "install",128  "translation": "translate", "formation": "form", "variation": "vary",129  "importance": "important", "innovative": "innovate", "bated": "bate",130  "chemist": "chemistry", "chemical": "chemistry", "chem": "chemistry",131  "architect": "architecture", "grocer": "grocery", "critic": "critique",132  "chilly": "chill", "launder": "laundry", "tension": "tense", "revolution": "revolve",133  "sensitive": "sense", "mutation": "mutate", "mutant": "mutate", "fated": "fate",134  "apery": "ape", "dingy": "dinge", "precession": "precess", "expertise": "expert",135  "dramatic": "drama", "pic": "picture", "tragic": "tragedy", "manse": "mansion",136  "administrate": "administer", "administrative": "administrate", "inquiry": "inquire",137  "administration": "administrate", "administrator": "administrate", "diplomat": "diplomacy",138  "federal": "federation", "analysis": "analyze", "emphasis": "emphasize",139  "chlorine": "chloride", "recognition": "recognize", "opposite": "oppose", "opponent": "oppose",140  "response": "respond", "tolerant": "tolerate", "remainder": "remain",141  "differential": "different", "differentiate": "different", "failure": "fail",142  "explosive": "explode", "civilization": "civil", "civilize": "civil",143  "success": "succeed", "application": "apply", "therapeutic": "therapy",144  "medical": "medicine", "beneficial": "benefit", "pianist": "piano",145  "blonde": "blond", "classification": "classify", "classify": "class",146  "technique": "technical", "technology": "technique", "technician": "technical",147  "millionaire": "million", "billionaire": "billion", "cigarette": "cigar",148  "adhesion": "adhere", "adhesive": "adhere", "chaotic": "chaos", "disclosure": "disclose",149  "destruction": "destroy", "concession": "concede", "rental": "rent",150  "influential": "influence", "strategic": "strategy", "minimal": "minimum",151  "mini": "minimum", "triangular": "triangle", "rebellion": "rebel",152  "intent": "intend", "replica": "replicate", "timer": "time", "timed": "time",153  "sparkle": "spark", "consensus": "consent", "probably": "probable", "pleasant": "please",154  "philosopher": "philosophy", "radiate": "radius", "tutorial": "tutor",155  "terminal": "terminus", "terminate": "terminus", "grief": "grieve", "grievance": "grieve",156  "anime": "animate", "surgeon": "surgery", "partition": "part", "pretense": "pretend",157  "concept": "conceive", "conceptual": "conceive", "solidarity": "solid",158  "economic": "economy", "economist": "economy", "decisive": "decide",159  "offense": "offend", "offensive": "offend", "necessary": "necessity",160  "frequency": "frequent", "portrait": "portray", "digital": "digit",161  "storage": "store", "nearly": "near", "granny": "grandmother", "sorry": "sorrow",162  "modification": "modify", "characteristic": "character", "anxious": "anxiety",163  "quantify": "quantity", "qualify": "quality", "appendix": "append",164  "quantitative": "quantity", "qualitative": "quality", "supremacy": "supreme",165  "imaginary": "imagine", "imaginative": "imagine", "disastrous": "disaster",166  "systematic": "system", "absorption": "absorb", "disciple": "discipline",167  "favorite": "favor", "prescription": "prescribe", "dominant": "dominate",168  "relief": "relieve", "laughter": "laugh", "participant": "participate",169  "companion": "company", "circular": "circle", "synthetic": "synthesis",170  "choice": "choose", "entrance": "entry", "maintenance": "maintain",171  "social": "society", "substantial": "substance", "identification": "identify",172  "assumption": "assume", "poet": "poetry", "info": "information", "information": "inform",173  "visible": "vision", "realistic": "real", "consumption": "consume", "reception": "recept",174  "photo": "photograph", "demo": "demonstrate", "publish": "public",175  "volunteer": "voluntary", "politician": "politics", "rationale": "rational",176  "physician": "physic", "physicist": "physics", "spectral": "specter",177  "birdie": "bird", "distillate": "distill", "earnings": "earn", "chimp": "chimpanzee",178  "nutrient": "nutrition", "nutritive": "nutrition", "delicacy": "delicate",179  "suspicion": "suspect", "disbelief": "disbelieve", "provocative": "provoke",180  "irritant": "irritate", "displeasure": "displease", "erroneous": "error",181  "humility": "humiliate", "consequence": "consequent", "barbaric": "barbarian",182  "mystic": "mystery", "festive": "festival", "festal": "festival", "intimacy": "intimate",183  "respiratory": "respiration", "respirator": "respiration", "sarcastic": "sarcasm",184  "crucify": "crucifix", "crucifixion": "crucifix", "abdominal": "abdomen",185  "medial": "median", "bureaucrat": "bureau", "wholly": "whole", "consul": "consulate",186  "repetition": "repeat", "repetitive": "repeat", "conquest": "conquer", "cavern": "cave",187  "rubbish": "rubble", "flammable": "flame", "ignorant": "ignore", "solitude": "solitary",188  "curiosity": "curious", "exceptionally": "exceptional", "blotch": "blot", "suckle": "suck",189  "negligent": "neglect", "negligence": "neglect", "infamous": "infamy",190  "deception": "deceit", "deceit": "deceive", "deceptive": "deceive",191  "irritable": "irritate", "prevalent": "prevail", "accusatory": "accuse",192  "conspiracy": "conspire", "envelop": "envelope", "capacitance": "capacitor",193  "romantic": "romance", "perm": "permanent", "feminist": "feminine",194  "demolition": "demolish", "trivial": "trivia", "instantaneous": "instant",195  "expense": "expend", "expenditure": "expend", "memorize": "memory",196  "memo": "memorandum", "consortium": "consort", "medallion": "medal", "godless": "god",197  "abrasion": "abrase", "abrasive": "abrase", "atheist": "atheism", "reunion": "reunite",198  "kindergartner": "kindergarten", "duckling": "duck", "introductory": "introduce",199  "baptism": "baptize", "sled": "sledge", "bobsled": "bobsleigh",200  "tarp": "tarpaulin", "intricacy": "intricate", "reverberate": "reverb",201  "glacial": "glacier", "legislature": "legislate", "redemption": "redeem",202  "predominant": "predominate", "lull": "lullaby", "butt": "buttock", "comfy": "comfort",203  "verification": "verify", "spectacular": "spectacle", "applause": "applaud",204  "theoretical": "theory", "curvature": "curve", "simply": "simple", "cafe": "cafeteria",205  "discussion": "discuss", "comparable": "compare", "comparative": "compare",206  "hysteric": "hysteria", "partial": "part", "generosity": "generous", "maths": "math",207  "prophecy": "prophesy", "prophet": "prophecy", "satisfactory": "satisfy",208  "fulfillment": "fulfill", "sufficient": "suffice", "energetic": "energy",209  "cosmic": "cosmos", "petrol": "petroleum", "applicable": "apply", "splendid": "splendor",210  "reproductive": "reproduce", "apologetic": "apology", "nervous": "nerve",211  "metabolic": "metabolism", "potency": "potent", "impotency": "impotent", "penal": "penalty",212  "migratory": "migrate", "migrant": "migrate", "immigrant": "immigrate", "emigrant": "emigrate",213  "amphibious": "amphibian", "menstrual": "menstruation", "president": "preside",214  "receptionist": "reception", "reception": "receive", "receipt": "receive",215  "receptive": "receive", "remembrance": "remember", "heartbroken": "heartbreak",216  "residential": "residence", "residency": "residence", "resident": "residence",217  "preparatory": "prepare", "glamorous": "glamour", "defense": "defend",218  "cellular": "cell", "viscosity": "viscous", "rhino": "rhinoceros", "hippo": "hippopotamus",219  "ancestral": "ancestor", "negative": "negate", "bacteria": "bacterium",220  "registration": "register", "registry": "register", "inaugural": "inaugurate",221  "alkaline": "alkali", "humane": "human", "divisible": "divide", "capacity": "capable",222  "grandpa": "grandfather", "grandma": "grandmother", "nauseous": "nausea",223  "luncheon": "lunch", "conscientious": "conscience", "mandatory": "mandate",224  "cleric": "clergy", "corrosion": "corrode", "limo": "limousine", "descriptive": "describe",225  "inflammable": "inflame", "inflammation": "inflame", "tremble": "tremor",226  "enthusiast": "enthusiasm", "pussy": "puss", "considerate": "consider",227  "eternity": "eternal", "monstrous": "monster", "clarity": "clarify", "illiteracy": "illiterate",228  "clarification": "clarify", "muscular": "muscle", "furniture": "furnish",229  "perception": "perceive", "percept": "perceive", "sensory": "sense", "symptomatic": "symptom",230  "destination": "destine", "categorical": "category", "ascent": "ascend",231  "ingenuity": "ingenious", "invention": "invent", "gymnast": "gymnastic",232  "propel": "propulsion", "belief": "believe", "whimsy": "whim", "disciplinary": "discipline",233  "mischievous": "mischief", "crazy": "craze", "liquefy": "liquid", "delicacy": "delicate",234  "confectionery": "confection", "resilience": "resilient", "grad": "graduate",235  "therapist": "therapy", "perseverance": "persevere", "intro": "introduction",236  "abolition": "abolish", "reparation": "repair", "testify": "testimony", "sports": "sport",237  "disqualification": "disqualify", "rectangular": "rectangle", "metropolitan": "metropolis",238  "sportsmanship": "sportsman", "atheist": "atheism", "prognostic": "prognosis",239  "assurance": "assure", "insurance": "insure", "extent": "extend", "mineral": "mine",240  "fort": "fortress", "pharmaceutical": "pharmacy", "menstrual": "menstruum",241  "community": "commune", "communal": "commune", "data": "datum", "agenda": "agendum",242  "metric": "meter", "democrat": "democracy", "presumption": "presume", "shelf": "shelve",243  "solitaire": "solitary", "explanatory": "explain", "woolen": "wool", "planar": "plain",244  "northeastern": "northeast", "northwestern": "northwest",245  "southeastern": "southeast", "southwestern": "southwest",246  "eastward": "east", "westward": "west", "wintry": "winter",247}248class BuildUnionDBBatch:249  def __init__(self, input_confs, output_path, core_labels, full_def_labels, gross_labels,250               surfeit_labels, top_labels, slim_labels, tran_list_labels, supplement_labels,251               phrase_prob_path, tran_prob_path, tran_aux_paths, tran_aux_last_paths,252               rev_prob_path, cooc_prob_path, aoa_paths, keyword_path, min_prob_map):253    self.input_confs = input_confs254    self.output_path = output_path255    self.core_labels = core_labels256    self.full_def_labels = full_def_labels257    self.gross_labels = gross_labels258    self.surfeit_labels = surfeit_labels259    self.top_labels = top_labels260    self.slim_labels = slim_labels261    self.tran_list_labels = tran_list_labels262    self.supplement_labels = supplement_labels263    self.phrase_prob_path = phrase_prob_path264    self.tran_prob_path = tran_prob_path265    self.tran_aux_paths = tran_aux_paths266    self.tran_aux_last_paths = tran_aux_last_paths267    self.rev_prob_path = rev_prob_path268    self.cooc_prob_path = cooc_prob_path269    self.aoa_paths = aoa_paths270    self.keyword_path = keyword_path271    self.min_prob_map = min_prob_map272    self.tokenizer = tkrzw_tokenizer.Tokenizer()273  def Run(self):274    start_time = time.time()275    logger.info("Process started: input_confs={}, output_path={}".format(276      str(self.input_confs), self.output_path))277    word_dicts = []278    for label, input_path in self.input_confs:279      slim = label in self.slim_labels280      word_dict = self.ReadInput(input_path, slim)281      word_dicts.append((label, word_dict))282    aux_trans = {}283    for tran_aux_path in self.tran_aux_paths:284      if not tran_aux_path: continue285      self.ReadTranAuxTSV(tran_aux_path, aux_trans)286    aux_last_trans = {}287    for tran_aux_last_path in self.tran_aux_last_paths:288      if not tran_aux_last_path: continue289      self.ReadTranAuxTSV(tran_aux_last_path, aux_last_trans)290    raw_aoa_words = collections.defaultdict(list)291    for aoa_path in self.aoa_paths:292      if not aoa_path: continue293      self.ReadAOAWords(aoa_path, raw_aoa_words)294    aoa_words = {}295    for word, values in raw_aoa_words.items():296      aoa_words[word] = sum(values) / len(values)297    keywords = set()298    if self.keyword_path:299      self.ReadKeywords(self.keyword_path, keywords)300    self.SaveWords(word_dicts, aux_trans, aux_last_trans, aoa_words, keywords)301    logger.info("Process done: elapsed_time={:.2f}s".format(time.time() - start_time))302  def NormalizeText(self, text):303    text = unicodedata.normalize('NFKC', text)304    text = regex.sub(r"[\u2018\u2019\u201A\u201B\u2758\u275B\u275C\u275F\uFF02]", "'", text)305    text = regex.sub(r"[\u201C\u201D\u201E\u201F]", '"', text)306    text = regex.sub(307      r"[\u00AD\u02D7\u2010\u2011\u2012\u2013\u2014\u2015\u2043\u2212\u2796\u2E3A\u2E3B" +308      r"\uFE58\uFE63\uFF0D]", "-", text)309    return text310  def ReadInput(self, input_path, slim):311    start_time = time.time()312    logger.info("Reading an input file: input_path={}".format(input_path))313    word_dict = collections.defaultdict(list)314    num_entries = 0315    with open(input_path) as input_file:316      for line in input_file:317        word = ""318        ipa = ""319        sampa = ""320        texts = []321        inflections = {}322        etymologies = {}323        alternatives = []324        mode = ""325        rel_words = {}326        for field in line.strip().split("\t"):327          columns = field.split("=", 1)328          if len(columns) < 2: continue329          name, value = columns330          value = self.NormalizeText(value)331          value = regex.sub(r"[\p{Z}\p{C}]+", " ", value).strip()332          if name == "word":333            word = value334          elif name == "pronunciation_ipa":335            ipa = value336          elif name == "pronunciation_sampa":337            sampa = value338          elif name.startswith("inflection_"):339            name = regex.sub(r"^[a-z]+_", "", name)340            inflections[name] = inflections.get(name) or value341          elif name.startswith("etymology_"):342            etymologies[name] = value343          elif name == "alternative":344            for alt_word in value.split(","):345              alt_word = alt_word.strip()346              if alt_word:347                alternatives.append(alt_word)348          elif name in poses:349            if slim:350              value = regex.sub(r" \[-+\] .*", "", value).strip()351            if value:352              texts.append((name, value))353          elif name in rel_weights:354            rel_words[name] = value355          elif name == "mode":356            mode = value357        if not ipa and sampa:358          ipa = tkrzw_pron_util.SampaToIPA(sampa)359        if not word or len(word) > 48:360          continue361        if ipa or texts or inflections or etymologies or alternatives:362          key = tkrzw_dict.NormalizeWord(word)363          entry = {"word": word}364          if ipa:365            entry["pronunciation"] = ipa366          for name, value in inflections.items():367            entry[name] = value368          for name, value in etymologies.items():369            entry[name] = value370          if alternatives:371            entry["alternative"] = alternatives372          entry["text"] = texts373          for rel_name, rel_value in rel_words.items():374            entry[rel_name] = rel_value375          if mode:376            key += "\t" + mode377          word_dict[key].append(entry)378          num_entries += 1379        if num_entries % 10000 == 0:380          logger.info("Reading an input: num_entries={}".format(num_entries))381    logger.info("Reading an input done: num_entries={}, elapsed_time={:.2f}s".format(382      num_entries, time.time() - start_time))383    return word_dict384  def ReadTranAuxTSV(self, input_path, aux_trans):385    start_time = time.time()386    logger.info("Reading a translation aux file: input_path={}".format(input_path))387    num_entries = 0388    with open(input_path) as input_file:389      for line in input_file:390        fields = line.strip().split("\t")391        if len(fields) < 2: continue392        word = self.NormalizeText(fields[0])393        values = aux_trans.get(word) or []394        uniq_trans = set()395        for tran in fields[1:]:396          tran = self.NormalizeText(tran)397          tran = regex.sub(r"[\p{Ps}\p{Pe}\p{C}]", "", tran)398          tran = regex.sub(r"[\p{Z}\p{C}]+", " ", tran).strip()399          norm_tran = tkrzw_dict.NormalizeWord(tran)400          if not tran or not norm_tran: continue401          if regex.search(r"\p{Latin}.*ã®.*(å½¢|åè©|ç´)", tran): continue402          if norm_tran in uniq_trans: continue403          uniq_trans.add(norm_tran)404          values.append(tran)405        aux_trans[word] = values406        num_entries += 1407        if num_entries % 10000 == 0:408          logger.info("Reading a translation aux file: num_entries={}".format(num_entries))409    logger.info("Reading a translation aux file: num_entries={}, elapsed_time={:.2f}s".format(410      num_entries, time.time() - start_time))411  def ReadAOAWords(self, input_path, aoa_words):412    start_time = time.time()413    logger.info("Reading a AOA file: input_path={}".format(input_path))414    num_entries = 0415    with open(input_path) as input_file:416      is_first = True417      for line in input_file:418        if is_first:419          is_first = False420          continue421        fields = line.strip().split(",")422        if len(fields) != 7: continue423        word = self.NormalizeText(fields[0]).strip()424        occur = fields[3]425        mean = fields[4]426        stddev = fields[5]427        if not word or not regex.fullmatch(r"[0-9.]+", mean): continue428        if not regex.fullmatch(r"[.0-9]+", occur): continue429        mean = float(mean)430        if regex.fullmatch(r"[0-9.]+", stddev):431          mean += float(stddev)432        else:433          mean += 3.0434        aoa_words[word].append(mean)435        num_entries += 1436        if num_entries % 10000 == 0:437          logger.info("Reading a AOA file: num_entries={}".format(num_entries))438    logger.info("Reading a translation aux file: num_entries={}, elapsed_time={:.2f}s".format(439      num_entries, time.time() - start_time))440  def ReadKeywords(self, input_path, keywords):441    start_time = time.time()442    logger.info("Reading a keyword file: input_path={}".format(input_path))443    num_entries = 0444    with open(input_path) as input_file:445      for line in input_file:446        keyword = self.NormalizeText(line).strip()447        keywords.add(keyword)448        num_entries += 1449        if num_entries % 10000 == 0:450          logger.info("Reading a keyword file: num_entries={}".format(num_entries))451    logger.info("Reading a translation aux file: num_entries={}, elapsed_time={:.2f}s".format(452      num_entries, time.time() - start_time))453  def SaveWords(self, word_dicts, aux_trans, aux_last_trans, aoa_words, keywords):454    logger.info("Preparing DBMs")455    phrase_prob_dbm = None456    if self.phrase_prob_path:457      phrase_prob_dbm = tkrzw.DBM()458      phrase_prob_dbm.Open(self.phrase_prob_path, False, dbm="HashDBM").OrDie()459    tran_prob_dbm = None460    if self.tran_prob_path:461      tran_prob_dbm = tkrzw.DBM()462      tran_prob_dbm.Open(self.tran_prob_path, False, dbm="HashDBM").OrDie()463    rev_prob_dbm = None464    if self.rev_prob_path:465      rev_prob_dbm = tkrzw.DBM()466      rev_prob_dbm.Open(self.rev_prob_path, False, dbm="HashDBM").OrDie()467    cooc_prob_dbm = None468    if self.cooc_prob_path:469      cooc_prob_dbm = tkrzw.DBM()470      cooc_prob_dbm.Open(self.cooc_prob_path, False, dbm="HashDBM").OrDie()471    start_time = time.time()472    logger.info("Extracting keys")473    keys = set()474    for label, word_dict in word_dicts:475      for key in word_dict.keys():476        if key.find("\t") >= 0: continue477        keys.add(key)478    logger.info("Extracting keys done: num_keys={}, elapsed_time={:.2f}s".format(479      len(keys), time.time() - start_time))480    start_time = time.time()481    logger.info("Indexing stems")482    stem_index = collections.defaultdict(list)483    for label, word_dict in word_dicts:484      if label in self.supplement_labels: continue485      for key in keys:486        for entry in word_dict[key]:487          word = entry["word"]488          if not regex.fullmatch("[a-z]+", word): continue489          stems = self.GetDerivativeStems(label, entry, word_dict, aux_trans, phrase_prob_dbm)490          if stems:491            valid_stems = set()492            for stem in stems:493              if stem in keys:494                stem_index[stem].append(word)495                valid_stems.add(stem)496            if valid_stems:497              entry["stem"] = list(valid_stems.union(set(entry.get("stem") or [])))498    for label, word_dict in word_dicts:499      if label not in self.core_labels: continue500      for key in keys:501        for entry in word_dict[key]:502          word = entry["word"]503          children = stem_index.get(word)504          if children:505            entry["stem_child"] = list(set(children))506    logger.info("Indexing stems done: num_stems={}, elapsed_time={:.2f}s".format(507      len(stem_index), time.time() - start_time))508    start_time = time.time()509    logger.info("Checking POS of words")510    noun_words = set()511    verb_words = set()512    adj_words = set()513    adv_words = set()514    for label, word_dict in word_dicts:515      if label in self.core_labels:516        for key in keys:517          for entry in word_dict[key]:518            word = entry["word"]519            for pos, text in entry["text"]:520              if pos == "noun": noun_words.add(word)521              if pos == "verb": verb_words.add(word)522              if pos == "adjective": adj_words.add(word)523              if pos == "adverb": adv_words.add(word)524    logger.info("Checking POS of words done: elapsed_time={:.2f}s".format(525      time.time() - start_time))526    start_time = time.time()527    logger.info("Indexing base forms")528    extra_word_bases = {}529    for label, word_dict in word_dicts:530      if label not in self.top_labels: continue531      base_index = collections.defaultdict(list)532      core_index = collections.defaultdict(list)533      for key, entries in word_dict.items():534        for entry in entries:535          word = entry["word"]536          if not regex.fullmatch("[a-z]+", word): continue537          if word in verb_words:538            children = set()539            for part_name in ("verb_present_participle", "verb_past_participle"):540              for part in (entry.get(part_name) or "").split(","):541                part = part.strip()542                if part and part != word and (part in noun_words or part in adj_words):543                  base_index[part].append(word)544                  extra_word_bases[part] = word545                  children.add(part)546            if children:547              entry["base_child"] = list(children)548          if word in adj_words:549            children = set()550            for part_name in ("adjective_comparative", "adjective_superlative"):551              part = entry.get(part_name)552              if part and (part in noun_words or part in adj_words):553                base_index[part].append(word)554                children.add(part)555            if children:556              entry["base_child"] = list(children)557          core = entry.get("etymology_core")558          prefix = entry.get("etymology_prefix")559          suffix = entry.get("etymology_suffix")560          if core and len(core) >= 4 and not prefix and suffix:561            entry["core"] = core562            core_index[core].append(word)563      for key, entries in word_dict.items():564        for entry in entries:565          word = entry["word"]566          if not regex.fullmatch("[a-z]+", word): continue567          bases = base_index.get(word)568          if bases:569            entry["base"] = list(bases)570          children = core_index.get(word)571          if children:572            entry["core_child"] = list(children)573    logger.info("Indexing base forms done: elapsed_time={:.2f}s".format(574      time.time() - start_time))575    start_time = time.time()576    logger.info("Merging entries: num_keys={}".format(len(keys)))577    merged_entries = []578    for key in keys:579      merged_entry = self.MergeRecord(580        key, word_dicts, aux_trans, aoa_words, keywords,581        phrase_prob_dbm, tran_prob_dbm, rev_prob_dbm, cooc_prob_dbm)582      if not merged_entry: continue583      merged_entries.append((key, merged_entry))584      if len(merged_entries) % 1000 == 0:585        logger.info("Merging entries:: num_entries={}".format(len(merged_entries)))586    logger.info("Making records done: num_records={}, elapsed_time={:.2f}s".format(587      len(merged_entries), time.time() - start_time))588    start_time = time.time()589    logger.info("Modifying entries")590    merged_entries = sorted(merged_entries)591    live_words = tkrzw.DBM()592    live_words.Open("", True, dbm="BabyDBM").OrDie()593    rev_live_words = tkrzw.DBM()594    rev_live_words.Open("", True, dbm="BabyDBM").OrDie()595    for key, merged_entry in merged_entries:596      for word_entry in merged_entry:597        word = word_entry["word"]598        prob = float(word_entry.get("probability") or 0)599        value = "{:.8f}".format(prob)600        live_words.Set(word, value).OrDie()601        rev_word = " ".join(reversed(word.split(" ")))602        rev_live_words.Set(rev_word, value).OrDie()603    num_entries = 0604    for key, merged_entry in merged_entries:605      for word_entry in merged_entry:606        word = word_entry["word"]607        entries = []608        for label, word_dict in word_dicts:609          dict_entries = word_dict.get(key)610          if not dict_entries: continue611          for entry in dict_entries:612            if entry["word"] == word:613              entries.append((label, entry))614        self.SetAOA(word_entry, entries, aoa_words, live_words, phrase_prob_dbm)615        self.SetTranslations(word_entry, aux_trans, tran_prob_dbm, rev_prob_dbm)616        self.SetRelations(word_entry, entries, word_dicts, live_words, rev_live_words,617                          phrase_prob_dbm, tran_prob_dbm, cooc_prob_dbm, extra_word_bases,618                          verb_words, adj_words, adv_words)619        if phrase_prob_dbm and cooc_prob_dbm:620          self.SetCoocurrences(word_entry, entries, word_dicts, phrase_prob_dbm, cooc_prob_dbm)621      num_entries += 1622      if num_entries % 1000 == 0:623        logger.info("Modifying entries: num_records={}".format(num_entries))624    logger.info("Modifying entries done: elapsed_time={:.2f}s".format(time.time() - start_time))625    start_time = time.time()626    logger.info("Finishing entries")627    merged_dict = {}628    for key, merged_entry in merged_entries:629      merged_dict[key] = merged_entry630    num_entries = 0631    for key, merged_entry in merged_entries:632      for word_entry in merged_entry:633        self.CompensateInflections(word_entry, merged_dict, verb_words)634        self.CompensateAlternatives(word_entry, merged_dict)635        self.PropagateTranslations(word_entry, merged_dict, tran_prob_dbm, aux_last_trans)636      num_entries += 1637      if num_entries % 1000 == 0:638        logger.info("Finishing entries R1: num_records={}".format(num_entries))639    num_entries = 0640    for key, merged_entry in merged_entries:641      for word_entry in merged_entry:642        self.SetPhraseTranslations(word_entry, merged_dict, aux_trans, aux_last_trans,643                                   tran_prob_dbm, phrase_prob_dbm, noun_words, verb_words,644                                   live_words, rev_live_words)645        self.FilterParents(word_entry, merged_dict)646        self.AbsorbInflections(word_entry, merged_dict)647      num_entries += 1648      if num_entries % 1000 == 0:649        logger.info("Finishing entries R2: num_records={}".format(num_entries))650    logger.info("Finishing entries done: elapsed_time={:.2f}s".format(time.time() - start_time))651    rev_live_words.Close().OrDie()652    live_words.Close().OrDie()653    if cooc_prob_dbm:654      cooc_prob_dbm.Close().OrDie()655    if rev_prob_dbm:656      rev_prob_dbm.Close().OrDie()657    if tran_prob_dbm:658      tran_prob_dbm.Close().OrDie()659    if phrase_prob_dbm:660      phrase_prob_dbm.Close().OrDie()661    start_time = time.time()662    logger.info("Saving records: output_path={}".format(self.output_path))663    word_dbm = tkrzw.DBM()664    num_buckets = len(merged_entries) * 2665    word_dbm.Open(self.output_path, True, dbm="HashDBM", truncate=True,666                  align_pow=0, num_buckets=num_buckets)667    num_records = 0668    for key, merged_entry in merged_entries:669      final_entry = []670      for word_entry in merged_entry:671        if word_entry.get("deleted"):672          continue673        for attr_name in list(word_entry.keys()):674          if attr_name.startswith("_"):675            del word_entry[attr_name]676        final_entry.append(word_entry)677      if not final_entry: continue678      serialized = json.dumps(final_entry, separators=(",", ":"), ensure_ascii=False)679      word_dbm.Set(key, serialized)680      num_records += 1681      if num_records % 1000 == 0:682        logger.info("Saving records: num_records={}".format(num_records))683    word_dbm.Close().OrDie()684    logger.info("Saving records done: num_records={}, elapsed_time={:.2f}s".format(685      len(merged_entries), time.time() - start_time))686  def GetDerivativeStems(self, label, entry, word_dict, aux_trans, phrase_prob_dbm):687    word = entry["word"]688    prob = 1.0689    if phrase_prob_dbm:690      prob = self.GetPhraseProb(phrase_prob_dbm, "en", word)691    texts = entry.get("text") or []692    def NormalizeTran(tran):693      tran = tran.strip()694      han_tran = regex.sub(r"[^\p{Han}]", "", tran)695      if len(han_tran) >= 2:696        tran = han_tran697      elif regex.fullmatch(r"\p{Han}\p{Hiragana}+", tran):698        poses = self.tokenizer.GetJaPosList(tran)699        while len(poses) >= 2:700          pos = poses[-1]701          if not regex.fullmatch(r"\p{Hiragana}+", pos[0]): break702          if pos[1] not in ["å©è©", "å©åè©"] and pos[2] not in ["æ¥å°¾", "éèªç«"]: break703          poses = poses[:-1]704        norm_tran = ""705        for pos in poses:706          norm_tran += pos[3]707        if len(norm_tran) >= 2:708          tran = norm_tran709      tran = regex.sub(r"^([\p{Han}]{2,})ç$", "\1", tran)710      return tran711    def GetMetadata(in_entry, out_poses, out_deris, out_trans):712      in_word = in_entry["word"]713      for pos, text in in_entry["text"]:714        out_poses.add(pos)715        for part in text.split("[-]"):716          part = part.strip()717          match = regex.search(r"^\[(synonym|derivative)\]: (.*)", part)718          if match:719            expr = regex.sub(r"\[-.*", "", match.group(2))720            for deri in expr.split(","):721              deri = deri.strip()722              if regex.fullmatch("[a-z]+", deri):723                out_deris.add(deri)724          match = regex.search(r"^\[translation\]: (.*)", part)725          if match:726            expr = regex.sub(r"\[-.*", "", match.group(1))727            expr = regex.sub(r"\(.*?\)", "", expr).strip()728            for tran in expr.split(","):729              tran = NormalizeTran(tran)730              if len(tran) >= 2:731                out_trans.add(tran)732        if label in self.gross_labels:733          text = regex.sub(r"\(.*?\)", "", text)734          text = regex.sub(r"ï¼.*?ï¼", "", text)735          for tran in regex.split(r"[,ãã]", text):736            tran = NormalizeTran(tran)737            if len(tran) >= 2:738              out_trans.add(tran)739      relation_expr = in_entry.get("relation")740      if relation_expr:741        for rel_word in relation_expr.split(","):742          check_len = max(5, len(rel_word) - 1)743          if rel_word[:check_len] == word[:check_len]:744            out_deris.add(rel_word)745      in_aux_trans = aux_trans.get(in_word)746      if in_aux_trans:747        for tran in in_aux_trans:748          tran = NormalizeTran(tran)749          if len(tran) >= 2:750            out_trans.add(tran)751    poses = set()752    deris = set()753    trans = set()754    GetMetadata(entry, poses, deris, trans)755    deri_probs = {}756    deri_trans = {}757    if phrase_prob_dbm:758      for deri in deris:759        if deri[:3] != word[:3]: continue760        deri_probs[deri] = self.GetPhraseProb(phrase_prob_dbm, "en", deri)761        for deri_entry in word_dict.get(deri) or []:762          if deri_entry["word"] == deri:763            one_deri_poses = set()764            one_deri_deris = set()765            one_deri_trans = set()766            GetMetadata(deri_entry, one_deri_poses, one_deri_deris, one_deri_trans)767            if one_deri_trans:768              deri_trans[deri] = one_deri_trans769    stems = set()770    for pos in poses:771      for rule_pos, suffixes in (772          ("noun", noun_suffixes),773          ("verb", verb_suffixes),774          ("adjective", adjective_suffixes),775          ("adverb", adverb_suffixes)):776        if pos == rule_pos:777          for suffix in suffixes:778            if word.endswith(suffix):779              stem = word[:-len(suffix)]780              if len(stem) >= 2:781                stems.add(stem)782                if len(suffix) >= 2 and stem[-1] == suffix[0]:783                  stems.add(stem + suffix[0])784                if len(suffix) >= 2 and stem[-1] == "i":785                  stems.add(stem[:-1] + "y")786                if len(suffix) >= 2 and suffix[0] == "i":787                  stems.add(stem + "e")788                if len(suffix) >= 2 and suffix[0] == "e":789                  stems.add(stem + "e")790                if len(suffix) >= 2 and suffix[0] == "t":791                  stems.add(stem + "t")792                if len(suffix) >= 3 and suffix[0] == "s":793                  stems.add(stem + "s")794                if suffix == "al" and len(stem) >= 3:795                  stems.add(stem + "es")796                  stems.add(stem + "e")797                if suffix == "y" and len(stem) >= 3:798                  stems.add(stem + "e")799                if suffix in ["tion", "sion"] and len(stem) >= 2:800                  stems.add(stem + "e")801                  stems.add(stem + "d")802                  stems.add(stem + "t")803                  stems.add(stem + "s")804                  stems.add(stem + "te")805                  stems.add(stem + "de")806                  stems.add(stem + "se")807                  stems.add(stem + "ve")808                if suffix in ["tion", "sion"] and len(stem) >= 3 and stem.endswith("a"):809                  stems.add(stem[:-1])810                  stems.add(stem[:-1] + "e")811                  stems.add(stem[:-1] + "ate")812                if suffix in ["tion", "sion"] and len(stem) >= 3 and stem.endswith("u"):813                  stems.add(stem[:-1] + "ve")814                if suffix == "sion" and len(stem) >= 3 and stem.endswith("s"):815                  stems.add(stem[:-1] + "t")816                if suffix in ["ible", "able"] and len(stem) >= 2:817                  stems.add(stem + "or")818                  stems.add(stem + "er")819                  stems.add(stem + "ify")820                  stems.add(stem + "y")821                if suffix == "ate":822                  stems.add(stem + "e")823                if suffix == "al" and len(stem) >= 3 and stem.endswith("r"):824                  stems.add(stem[:-1] + "er")825                if suffix == "ive" and len(stem) >= 3 and stem.endswith("s"):826                  stems.add(stem[:-1] + "d")827                  stems.add(stem[:-1] + "de")828                if suffix == "ic" and len(stem) >= 3:829                  stems.add(stem + "y")830                if suffix == "ize" and len(stem) >= 3:831                  stems.add(stem + "y")832                if suffix == "ity" and len(stem) >= 6 and stem.endswith("bil"):833                  stems.add(stem[:-3] + "ble")834                if suffix == "pt" and len(stem) >= 3:835                  stems.add(stem[:-1] + "ve")836                if suffix == "ce" and len(stem) >= 3:837                  stems.add(stem + "t")838                  stems.add(stem + "d")839                  stems.add(stem + "se")840                if suffix == "ian" and len(stem) >= 4:841                  stems.add(stem + "y")842                if suffix == "cy" and len(stem) >= 4:843                  stems.add(stem + "t")844                if suffix == "faction" and len(stem) >= 4:845                  stems.add(stem + "fy")846                if suffix == "ous" and len(stem) >= 4:847                  stems.add(stem + "on")848                  stems.add(stem + "y")849                  stems.add(stem + "e")850                if suffix == "ous" and len(stem) >= 5 and stem.endswith("ul"):851                  stems.add(stem[:-2] + "le")852                if suffix == "ant" and len(stem) >= 4:853                  stems.add(stem + "ate")854                  stems.add(stem + "e")855                if suffix == "ative" and len(stem) >= 4:856                  stems.add(stem + "e")857                if suffix in ["er", "or", "ive"] and len(stem) >= 5:858                  stems.add(stem + "e")859                if len(stem) >= 3 and stem.endswith("u"):860                  stems.add(stem + "e")861                if len(stem) >= 4 and stem.endswith("i"):862                  stems.add(stem[:-1] + "e")863                if len(stem) >= 4 and stem.endswith("rr"):864                  stems.add(stem[:-1])865                if len(stem) >= 5 and stem.endswith("t"):866                  stems.add(stem[:-1] + "ce")867                  stems.add(stem[:-1] + "d")868                if len(stem) >= 5 and stem.endswith("v"):869                  stems.add(stem + "e")870                if len(stem) >= 8 and stem.endswith("tic"):871                  stems.add(stem + "s")872                if len(stem) >= 4 and stem[-1] == stem[-2]:873                  stems.add(stem[:-1])874    stems.discard(word)875    #print("STEM", label, word, stems)876    877    valid_stems = set()878    for pos, text in texts:879      match = regex.search(880        r'^[" ]*([\p{Latin}]+)[" ]*ã®(è¤æ°å½¢|ä¸äººç§°|ååè©|ç¾å¨åè©|éå»å½¢|éå»åè©)', text)881      if match:882        stem = match.group(1)883        if len(stem) >= 4 and word.startswith(stem):884          valid_stems.add(stem)885    for stem in stems:886      if phrase_prob_dbm:887        stem_prob = self.GetPhraseProb(phrase_prob_dbm, "en", stem)888      else:889        stem_prob = prob890      stem_prob_ratio = stem_prob / prob891      if label not in self.core_labels and stem_prob_ratio < 0.1: continue892      if (stem.find(" ") < 0 and len(stem) >= 8 and len(stem) < len(word) and893          stem_prob_ratio >= 0.5):894        valid_stems.add(stem)895        continue896      stem_entry = None897      for tmp_stem_entry in word_dict.get(stem) or []:898        if tmp_stem_entry["word"] == stem:899          stem_entry = tmp_stem_entry900      is_known = False901      if stem_prob_ratio >= 0.001:902        is_known = True903      if stem_entry and word in (stem_entry.get("related") or []):904        is_known = True905      if len(stem) >= 6 and stem_prob_ratio >= 0.0002:906        is_known = True907      stem_poses = set()908      stem_deris = set()909      stem_trans = set()910      if stem_entry:911        GetMetadata(stem_entry, stem_poses, stem_deris, stem_trans)912      if stem.find(" ") < 0 and len(stem) >= 4 and trans:913        hit_deri = False914        if word in stem_deris:915          hit_deri = True916        hit_tran = False917        for stem_tran in stem_trans:918          if stem_tran in trans:919            hit_tran = True920          if regex.search(r"\p{Han}", stem_tran):921            for tran in trans:922              if tran.find(stem_tran) >= 0 or stem_tran.find(tran) >= 0:923                hit_tran = True924        if ((hit_deri and hit_tran) or (stem_prob_ratio >= 0.1 and hit_deri) or925            (is_known and hit_tran)):926          valid_stems.add(stem)927      check_len = max(3, len(stem) - 2)928      for deri in deris:929        if len(word) < len(deri):930          continue931        deri_prob = deri_probs.get(deri) or 0.0932        deri_prob_ratio = deri_prob / prob933        hit_deri = False934        if deri == stem:935          hit_deri = True936        if stem[:check_len] == deri[:check_len] and len(stem) >= 4:937          prefix = deri[:len(stem)]938          if prefix == stem:939            hit_deri = True940          if len(prefix) >= 6 and tkrzw.Utility.EditDistanceLev(stem, prefix) < 2:941            hit_deri = True942        hit_tran = False943        for deri_tran in deri_trans.get(deri) or []:944          if deri_tran in trans:945            hit_tran = True946          if regex.search(r"\p{Han}", deri_tran):947            for tran in trans:948              if tran.find(deri_tran) >= 0:949                hit_tran = True950        if hit_deri and (deri_prob_ratio >= 0.1 or hit_tran):951          valid_stems.add(deri)952    force_parent = force_parents.get(word)953    if force_parent:954      valid_stems.clear()955      valid_stems.add(force_parent)956    valid_stems.discard(word)957    #print("VALID", word, valid_stems)958    959    return list(valid_stems)960  def MergeRecord(self, key, word_dicts, aux_trans, aoa_words, keywords,961                  phrase_prob_dbm, tran_prob_dbm, rev_prob_dbm, cooc_prob_dbm):962    word_entries = {}963    word_shares = collections.defaultdict(float)964    word_trans = collections.defaultdict(set)965    entry_tran_texts = collections.defaultdict(list)966    num_words = 0967    poses = collections.defaultdict(set)968    synonyms = collections.defaultdict(set)969    core = None970    for label, word_dict in word_dicts:971      dict_entries = word_dict.get(key)972      if not dict_entries: continue973      for entry in dict_entries:974        num_words += 1975        word = entry["word"]976        entries = word_entries.get(word) or []977        entries.append((label, entry))978        word_entries[word] = entries979        texts = entry.get("text")980        if texts:981          text_score = len(texts) * 1.0982          for pos, text in texts:983            poses[word].add(pos)984            trans = self.ExtractTextLabelTrans(text)985            if trans:986              text_score += 0.5987              word_trans[word].update(trans)988          word_shares[word] += math.log2(1 + text_score)989        expr = entry.get("synonym")990        if expr:991          for synonym in regex.split(r"[,;]", expr):992            synonym = synonym.strip()993            if regex.search(r"\p{Latin}", synonym) and synonym.lower() != word.lower():994              synonyms[word].add(synonym)995        if not core:996          core = entry.get("core")997      dict_entries = word_dict.get(key + "\ttranslation")998      if dict_entries:999        for entry in dict_entries:1000          word = entry["word"]1001          tran_texts = entry.get("text")1002          if not tran_texts: continue1003          for tran_pos, tran_text in tran_texts:1004            tran_key = word + "\t" + label + "\t" + tran_pos1005            entry_tran_texts[tran_key].append(tran_text)1006            trans = self.ExtractTextLabelTrans(tran_text)1007            if trans:1008              word_trans[word].update(trans)1009    sorted_word_shares = sorted(word_shares.items(), key=lambda x: x[1], reverse=True)1010    if len(sorted_word_shares) > 1 and aux_trans and tran_prob_dbm:1011      spell_ratios = {}1012      if phrase_prob_dbm:1013        word_probs = {}1014        for word, share in sorted_word_shares:1015          if word in word_probs: continue1016          prob = self.GetPhraseProb(phrase_prob_dbm, "en", word)1017          if not regex.search(r"\p{Lu}", word):1018            prob *= 1.11019          word_probs[word] = prob1020        sum_prob = sum([x[1] for x in word_probs.items()])1021        for word, prob in word_probs.items():1022          spell_ratios[word] = prob / sum_prob1023      word_scores = []1024      for word, share in sorted_word_shares:1025        score = 0.01026        if word in keywords:1027          score += 0.11028        cap_aux_trans = aux_trans.get(word) or []1029        if cap_aux_trans:1030          score += 0.11031        cap_word_trans = word_trans.get(word) or []1032        cap_trans = set(cap_aux_trans).union(cap_word_trans)1033        tran_score = 0.01034        if cap_trans:1035          key = tkrzw_dict.NormalizeWord(word)1036          tsv = tran_prob_dbm.GetStr(key)1037          if tsv:1038            fields = tsv.split("\t")1039            max_prob = 0.01040            sum_prob = 0.01041            for i in range(0, len(fields), 3):1042              src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])1043              if src != word:1044                prob *= 0.11045              if not regex.search(r"[\p{Han}\{Hiragana}]", trg):1046                prob *= 0.51047                if regex.search(r"\p{Lu}", src):1048                  prob *= 0.51049              if trg in cap_trans:1050                max_prob = max(max_prob, prob)1051                sum_prob += prob1052            tran_score += (sum_prob * max_prob) ** 0.51053        spell_score = (spell_ratios.get(word) or 0.0) * 0.51054        score += ((tran_score + 0.05) * (share + 0.05) * (spell_score + 0.05)) ** (1 / 3)1055        word_scores.append((word, score))1056      sorted_word_shares = sorted(word_scores, key=lambda x: x[1], reverse=True)1057    share_sum = sum([x[1] for x in sorted_word_shares])1058    merged_entry = []1059    for word, share in sorted_word_shares:1060      entries = word_entries[word]1061      word_entry = {}1062      word_entry["word"] = word1063      stem = " ".join(self.tokenizer.Tokenize("en", word, False, True))1064      effective_labels = set()1065      surfaces = set([word.lower()])1066      is_keyword = (word in aux_trans or word in aoa_words or word in keywords or1067                    (core and core in keywords))1068      word_poses = poses[word]1069      for pos in word_poses:1070        for rule_pos, suffixes in (1071            ("noun", noun_suffixes),1072            ("verb", verb_suffixes),1073            ("adjective", adjective_suffixes),1074            ("adverb", adverb_suffixes)):1075          if pos == rule_pos:1076            for suffix in suffixes:1077              if word.endswith(suffix):1078                pos_stem = word[:-len(suffix)]1079                if len(pos_stem) >= 4:1080                  pos_stems = set()1081                  pos_stems.add(pos_stem)1082                  pos_stems.add(regex.sub(r"i$", r"y", pos_stem))1083                  for pos_stem in pos_stems:1084                    if pos_stem in aux_trans or pos_stem in aoa_words or pos_stem in keywords:1085                      is_keyword = True1086                      break1087      if not is_keyword and "verb" in word_poses and regex.fullmatch(r"[a-z ]+", word):1088        tokens = self.tokenizer.Tokenize("en", word, False, False)1089        if len(tokens) >= 2 and tokens[0] in keywords:1090          particle_suffix = True1091          for token in tokens[1:]:1092            if not token in particles:1093              particle_suffix = False1094              break1095          if particle_suffix:1096            is_keyword = True1097      is_super_keyword = is_keyword and bool(regex.fullmatch(r"\p{Latin}{3,}", word))1098      for label, entry in entries:1099        if label not in self.surfeit_labels or is_keyword:1100          effective_labels.add(label)1101        for top_name in top_names:1102          if label not in self.top_labels and top_name in word_entry: continue1103          value = entry.get(top_name)1104          if value:1105            value = unicodedata.normalize('NFKC', value)1106            word_entry[top_name] = value1107        for infl_name in inflection_names:1108          value = entry.get(infl_name)1109          if value:1110            surfaces.add(value.lower())1111      if merged_entry and not effective_labels:1112        continue1113      for label, entry in entries:1114        texts = entry.get("text")1115        if not texts: continue1116        has_good_text = False1117        for pos, text in texts:1118          pure_text = regex.sub(r"[^\p{Latin}\p{Han}\p{Hiragana}\p{Katakana}\d]", "", text)1119          if not pure_text or pure_text == stem:1120            continue1121          has_good_text = True1122        if not has_good_text:1123          continue1124        for pos, text in texts:1125          items = word_entry.get("item") or []1126          tran_key = word + "\t" + label + "\t" + pos1127          sections = []1128          for section in text.split(" [-] "):1129            if not sections:1130              sections.append(section)1131              continue1132            eg_match = regex.search(r"^e\.g\.: (.*)", section)1133            if eg_match:1134              eg_text = eg_match.group(1).lower()1135              eg_words = regex.findall("[-\p{Latin}]+", eg_text)1136              hit = False1137              for surface in surfaces:1138                if surface in eg_words:1139                  hit = True1140                  break1141              if not hit: continue1142            sections.append(section)1143          text = " [-] ".join(sections)1144          tran_texts = entry_tran_texts.get(tran_key)1145          if tran_texts:1146            del entry_tran_texts[tran_key]1147            for tran_text in tran_texts:1148              tran_item = {"label": label, "pos": pos, "text": tran_text}1149              items.append(tran_item)1150          item = {"label": label, "pos": pos, "text": text}1151          items.append(item)1152          word_entry["item"] = items1153      if "item" not in word_entry:1154        continue1155      num_eff_items = 01156      for item in word_entry["item"]:1157        text = item["text"]1158        if regex.search(r" (of|for) +\"", text) and len(text) < 50:1159          continue1160        if (regex.search(r"\p{Latin}.*ã®.*(åæ°|è¤æ°|ç¾å¨|éå»|æ¯è¼|æä¸).*(å½¢|ç´|åè©)", text) and1161            len(text) < 30):1162          continue1163        num_eff_items += 11164      if num_eff_items == 0:1165        continue1166      prob = None1167      if phrase_prob_dbm:1168        prob = self.GetPhraseProb(phrase_prob_dbm, "en", word)1169        if stem.lower() != word.lower():1170          if word.endswith("ics"):1171            prob *= 1.11172          elif word.count(" "):1173            prob *= 0.51174          else:1175            prob *= 0.11176        word_entry["probability"] = "{:.7f}".format(prob).replace("0.", ".")1177        if self.min_prob_map:1178          has_good_label = False1179          for item in word_entry["item"]:1180            if item["label"] not in self.min_prob_map:1181              has_good_label = True1182              break1183          if not has_good_label:1184            new_items = []1185            for item in word_entry["item"]:1186              is_good_item = True1187              for label, min_prob in self.min_prob_map.items():1188                if item["label"] == label:1189                  if is_keyword:1190                    min_prob *= 0.11191                  if is_super_keyword:1192                    norm_text = tkrzw_dict.NormalizeWord(item["text"])1193                    norm_text = regex.sub(r"^(to|a|an|the) +([\p{Latin}])", r"\2", norm_text)1194                    dist = tkrzw.Utility.EditDistanceLev(key, norm_text)1195                    dist /= max(len(key), len(norm_text))1196                    if dist > 0.5 or word in aux_trans or (core and core in aux_trans):1197                      min_prob = 0.01198                  if prob < min_prob:1199                    is_good_item = False1200              if is_good_item:1201                new_items.append(item)1202            word_entry["item"] = new_items1203      if not word_entry.get("item"):1204        continue1205      share_ratio = share / share_sum1206      if share_ratio < 1:1207        word_entry["share"] = "{:.3f}".format(share_ratio).replace("0.", ".")1208      uniq_alternatives = set()1209      scored_alternatives = []1210      for label, entry in entries:1211        alternatives = entry.get("alternative")1212        if alternatives:1213          for alternative in alternatives:1214            norm_alt = tkrzw_dict.NormalizeWord(alternative)1215            if norm_alt == key: continue1216            if label not in self.core_labels:1217              dist = tkrzw.Utility.EditDistanceLev(key, norm_alt)1218              dist_ratio = dist / max(len(key), len(norm_alt))1219              if dist > 4 or dist_ratio > 0.3: continue1220            if alternative not in uniq_alternatives:1221              alt_prob = self.GetPhraseProb(phrase_prob_dbm, "en", alternative)1222              scored_alternatives.append((alternative, alt_prob))1223              uniq_alternatives.add(alternative)1224      if scored_alternatives:1225        scored_alternatives = sorted(scored_alternatives, key=lambda x: x[1], reverse=True)1226        word_entry["alternative"] = [x[0] for x in scored_alternatives]1227      word_synonyms = synonyms[word]1228      if word_synonyms:1229        word_entry["_synonym"] = list(word_synonyms)1230      merged_entry.append(word_entry)1231    return merged_entry1232  def GetPhraseProb(self, prob_dbm, language, word):1233    base_prob = 0.0000000011234    tokens = self.tokenizer.Tokenize(language, word, False, True)1235    if not tokens: return base_prob1236    max_ngram = min(3, len(tokens))1237    fallback_penalty = 1.01238    for ngram in range(max_ngram, 0, -1):1239      if len(tokens) <= ngram:1240        cur_phrase = " ".join(tokens)1241        prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)1242        if prob:1243          return max(prob, base_prob)1244        fallback_penalty *= 0.11245      else:1246        probs = []1247        index = 01248        miss = False1249        while index <= len(tokens) - ngram:1250          cur_phrase = " ".join(tokens[index:index + ngram])1251          cur_prob = float(prob_dbm.GetStr(cur_phrase) or 0.0)1252          if not cur_prob:1253            miss = True1254            break1255          probs.append(cur_prob)1256          index += 11257        if not miss:1258          inv_sum = 01259          for cur_prob in probs:1260            inv_sum += 1 / cur_prob1261          prob = len(probs) / inv_sum1262          prob *= 0.3 ** (len(tokens) - ngram)1263          prob *= fallback_penalty1264          return max(prob, base_prob)1265        fallback_penalty *= 0.11266    return base_prob1267  def SetAOA(self, word_entry, entries, aoa_words, live_words, phrase_prob_dbm):1268    word = word_entry["word"]1269    phrase_prob = min(float(word_entry.get("probability") or 0), 0.0000001)1270    share = float(word_entry.get("share") or 1)1271    share_bias = 0.01272    if share < 0.5:1273      share_bias = (0.5 - share) * 41274    aoa = aoa_words.get(word)1275    if aoa:1276      aoa += share_bias1277      word_entry["aoa"] = "{:.3f}".format(aoa)1278    concepts = set()1279    for label, entry in entries:1280      stems = entry.get("stem")1281      if stems:1282        for stem in stems:1283          concepts.add(stem)1284      core = entry.get("core")1285      if core:1286        concepts.add(core)1287    min_aoa = sys.maxsize1288    for concept in concepts:1289      if not live_words.Get(concept):1290        continue1291      aoa = aoa_words.get(concept)1292      if aoa:1293        if phrase_prob and phrase_prob_dbm:1294          concept_prob = self.GetPhraseProb(phrase_prob_dbm, "en", concept)1295          diff = max(math.log(concept_prob) - math.log(phrase_prob), 0.0)1296          aoa += min(diff * 1.0, 1.0)1297        else:1298          aoa += 1.01299        min_aoa = min(min_aoa, aoa)1300    if min_aoa < sys.maxsize:1301      min_aoa += share_bias1302      word_entry["aoa_concept"] = "{:.3f}".format(min_aoa)1303    bases = set()1304    for label, entry in entries:1305      tmp_bases = entry.get("base")1306      if tmp_bases:1307        for base in tmp_bases:1308          bases.add(base)1309    stem = " ".join(self.tokenizer.Tokenize("en", word, False, True))1310    if stem != word:1311      bases.add(stem)1312    min_aoa = sys.maxsize1313    for base in bases:1314      if not live_words.Get(base):1315        continue1316      aoa = aoa_words.get(base)1317      if aoa:1318        aoa += 1.01319        min_aoa = min(min_aoa, aoa)1320    if min_aoa < sys.maxsize:1321      min_aoa += share_bias1322      word_entry["aoa_base"] = "{:.3f}".format(min_aoa)1323  def ExtractTextLabelTrans(self, text):1324    trans = []1325    match = regex.search(r"\[translation\]: ", text)1326    if match:1327      text = text[match.end():]1328      text = regex.sub(r"\[-.*", "", text)1329      text = regex.sub(r"\(.*?\)", "", text)1330      for tran in text.split(","):1331        tran = unicodedata.normalize('NFKC', tran)1332        tran = tran.strip()1333        tran = regex.sub(r"[\p{S}\p{P}]+ *(ã|ã®|ã|ã«|ã¸|ã¨|ãã|ãã|ã§|ã)", "", tran)1334        tran = regex.sub(r"[ï½\p{S}\p{P}]", " ", tran)1335        tran = regex.sub(r" +(?=[\p{Han}\p{Hiragana}\p{Katakana}ã¼])", "", tran)1336        tran = regex.sub(r"[\p{Z}]+", " ", tran).strip()1337        if tran:1338          trans.append(tran)1339    return trans1340  def SetTranslations(self, entry, aux_trans, tran_prob_dbm, rev_prob_dbm):1341    word = entry["word"]1342    tran_probs = {}1343    if tran_prob_dbm:1344      key = tkrzw_dict.NormalizeWord(word)1345      tsv = tran_prob_dbm.GetStr(key)1346      if tsv:1347        fields = tsv.split("\t")1348        extra_records = []1349        for i in range(0, len(fields), 3):1350          src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])1351          if regex.search("[ã£ã]$", trg) and self.tokenizer.GetJaLastPos(trg)[1] == "åè©":1352            continue1353          if src != word:1354            prob *= 0.11355          norm_trg = tkrzw_dict.NormalizeWord(trg)1356          if tkrzw_dict.IsStopWord("ja", norm_trg):1357            prob *= 0.71358          elif len(norm_trg) < 2:1359            prob *= 0.91360          prob **= 0.81361          tran_probs[norm_trg] = max(tran_probs.get(norm_trg) or 0.0, prob)1362          stem_trg = regex.sub(1363            r"([\p{Han}\p{Katakana}ã¼]{2,})(ãã|ãããã¨|ããã|ããããã¨|ããã)$",1364            r"\1", norm_trg)1365          if stem_trg != norm_trg:1366            extra_records.append((stem_trg, prob * 0.5))1367          stem_trg = self.tokenizer.CutJaWordNounParticle(norm_trg)1368          if stem_trg != norm_trg:1369            extra_records.append((stem_trg, prob * 0.5))1370          stem_trg = regex.sub(r"([\p{Han}\p{Katakana}ã¼]{2,})(ç|çãª|çã«)$", r"\1", norm_trg)1371          if stem_trg != norm_trg:1372            extra_records.append((stem_trg, prob * 0.5))1373          if self.tokenizer.IsJaWordSahenNoun(norm_trg):1374            long_trg = norm_trg + "ãã"1375            extra_records.append((long_trg, prob * 0.5))1376        for extra_trg, extra_prob in extra_records:1377          tran_probs[extra_trg] = max(tran_probs.get(extra_trg) or 0.0, extra_prob)1378    word_aux_trans = aux_trans.get(word)1379    count_aux_trans = {}1380    if word_aux_trans:1381      for aux_tran in word_aux_trans:1382        count = (count_aux_trans.get(aux_tran) or 0) + 11383        count_aux_trans[aux_tran] = count1384      aux_weight = 1.01385      extra_records = []1386      for aux_tran, count in count_aux_trans.items():1387        aux_score = (0.01 ** (1 / (count + 1))) * aux_weight1388        prob = (tran_probs.get(aux_tran) or 0) + aux_score1389        tran_probs[aux_tran] = prob1390        stem_tran = regex.sub(1391          r"([\p{Han}\p{Katakana}ã¼]{2,})(ãã|ãããã¨|ããã|ããããã¨|ããã)$",1392          r"\1", aux_tran)1393        if stem_tran != aux_tran:1394          extra_records.append((stem_tran, aux_score * 0.5))1395        stem_tran = self.tokenizer.CutJaWordNounParticle(aux_tran)1396        if stem_tran != aux_tran:1397          extra_records.append((stem_tran, aux_score * 0.5))1398        stem_tran = regex.sub(r"([\p{Han}\p{Katakana}ã¼]{2,})(ç|çãª|çã«)$", r"\1", aux_tran)1399        if stem_tran != aux_tran:1400          extra_records.append((stem_tran, aux_score * 0.5))1401        if self.tokenizer.IsJaWordSahenNoun(aux_tran):1402          long_tran = aux_tran + "ãã"1403          extra_records.append((long_tran, aux_score * 0.5))1404        aux_weight *= 0.91405      for extra_tran, extra_prob in extra_records:1406        tran_probs[extra_tran] = max(tran_probs.get(extra_tran) or 0.0, extra_prob)1407    translations = {}1408    tran_labels = {}1409    def Vote(tran, weight, label):1410      if regex.search(r"^(noun|verb|adj|adv|[0-9])[^\p{Latin}]", tran):1411        return1412      norm_tran = tkrzw_dict.NormalizeWord(tran)1413      score = 0.000011414      if rev_prob_dbm:1415        prob = self.GetPhraseProb(rev_prob_dbm, "ja", tran)1416        prob = max(prob, 0.0000001)1417        prob = math.exp(-abs(math.log(0.001) - math.log(prob))) * 0.11418        if tkrzw_dict.IsStopWord("ja", tran) or tran in ("åã¯"):1419          prob *= 0.51420        score += prob1421      score *= weight1422      old_score = translations.get(tran) or 0.01423      translations[tran] = max(old_score, score)1424      if label:1425        old_labels = tran_labels.get(norm_tran) or set()1426        old_labels.add(label)1427        tran_labels[norm_tran] = old_labels1428    body_weight = 1.01429    tran_weight = 0.71430    for item in entry["item"]:1431      label = item["label"]1432      pos = item["pos"]1433      sections = item["text"].split(" [-] ")1434      text = sections[0]1435      text = regex.sub(r"ã *(ã¾ã|ã¾ãã¯|åã¯)ã.*?ã", r"ã", text)1436      if (label in self.gross_labels and1437          regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}ã¼]", text)):1438        weight = body_weight1439        body_weight *= 0.91440        if regex.search(r"[\(ï¼ãã\{\(]([^)ï¼ãã\}\]]+[ã»ã])?" +1441                        r"(ä¿|ä¿èª|ã¹ã©ã³ã°|å|åèª|é èª|å¤|å¤èª|å»|å»ç¨|å»èª)+[)ï¼ãã\}\]]",1442                        text):1443          weight *= 0.11444        text = regex.sub(r"[\(ï¼ãã\{\(].*?[)ï¼ãã\}\]]", "ã", text)1445        text = regex.sub(r"[ï½¥ã»]", "", text)1446        text = regex.sub(r"\p{Z}+", " ", text).strip()1447        if regex.search(1448            r"ã®(ç´æ¥æ³|ç´èª¬æ³|ä»®å®æ³)?(ç¾å¨|éå»)?(ç¬¬?[ä¸äºä¸]äººç§°)?[ ã»ï½¥ã]?" +1449            r"(åæ°|è¤æ°|ç¾å¨|éå»|æ¯è¼|æä¸|é²è¡|å®äº|ååè©|åç´|ç¸®ç´)+[ ã»ï½¥ã]?" +1450            r"(å½¢|å|åè©|ç´|ååè©|åè©|åè©|å½¢å®¹è©|å¯è©)+", text):1451          continue1452        if regex.search(r"ã®(ç´æ¥æ³|ç´èª¬æ³|ä»®å®æ³)(ç¾å¨|éå»)", text):1453          continue1454        if regex.search(r"ã®(ååè©|ç°ç¶´|ç°ä½|ç°å½¢|å¤èª|ç¥|çç¥|çç¸®|é åèª)", text):1455          continue1456        if regex.search(r"ãã®ä»ã[^ãã]{12,}", text):1457          continue1458        text = regex.sub(r" \[-+\] .*", "", text).strip()1459        text = regex.sub(r" -+ .*", "", text).strip()1460        for tran in regex.split("[ã|ã|ï¼|,|;]", text):1461          if len(translations) > 1:1462            if tran in ("ã¾ã", "ã¾ãã¯", "åã¯", "ãã°ãã°"):1463              continue1464          if regex.search(r"^[ \p{Latin}]+ã", tran):1465            continue1466          tran = regex.sub(r"^[\p{S}\p{P}]+ *(ã|ã®|ã|ã«|ã¸|ã¨|ãã|ãã|ã§|ã)", "", tran)1467          tran = regex.sub(r"[ï½ã]", "", tran)1468          tokens = self.tokenizer.Tokenize("ja", tran, False, False)1469          if len(tokens) > 6:1470            break1471          if regex.search(r"^[ \p{Latin}]+ *ãªã©", tran):1472            continue1473          if regex.search(r"[\p{Latin}].*ã®.*(è©å½¢|ç¶´ã)$", tran):1474            continue1475          tran = " ".join(tokens)1476          tran = regex.sub(r"([\p{Han}\p{Hiragana}\p{Katakana}ã¼]) +", r"\1", tran)1477          tran = regex.sub(r" +([\p{Han}\p{Hiragana}\p{Katakana}ã¼])", r"\1", tran)1478          tran = regex.sub(r"[\p{Z}]+", " ", tran).strip()1479          if tran:1480            Vote(tran, weight, label)1481            weight *= 0.81482      if label in self.tran_list_labels:1483        for section in sections:1484          trans = self.ExtractTextLabelTrans(section)1485          if not trans: continue1486          weight = tran_weight1487          tran_weight *= 0.91488          uniq_trans = set()1489          for tran in trans:1490            norm_tran = self.tokenizer.NormalizeJaWordForPos(pos, tran)1491            if norm_tran and norm_tran not in uniq_trans:1492              Vote(norm_tran, weight, label)1493              weight *= 0.81494              uniq_trans.add(norm_tran)1495      if label in self.supplement_labels:1496        text = sections[0]1497        uniq_trans = set()1498        for tran in regex.split("[;,]", text):1499          norm_tran = self.tokenizer.NormalizeJaWordForPos(pos, tran.strip())1500          if norm_tran and norm_tran not in uniq_trans:1501            Vote(norm_tran, 0.01, "")1502    pos_scores = {}1503    pos_base_score = 1.01504    for item in entry["item"]:1505      pos = item["pos"]1506      score = pos_base_score1507      if item["label"] not in self.core_labels:1508        score *= 0.751509      pos_scores[pos] = (pos_scores.get(pos) or 0.0) + score1510      pos_base_score *= 0.91511    pos_sum_score = 0.0011512    for pos, score in pos_scores.items():1513      pos_sum_score += score1514    pure_noun = (pos_scores.get("noun") or 0.0) / pos_sum_score >= 0.91515    pure_verb = (pos_scores.get("verb") or 0.0) / pos_sum_score >= 0.91516    pure_adjective = (pos_scores.get("adjective") or 0.0) / pos_sum_score >= 0.91517    pure_adverb = (pos_scores.get("adverb") or 0.0) / pos_sum_score >= 0.91518    bonus_translations = []1519    scored_translations = set()1520    for tran, score in translations.items():1521      tran = unicodedata.normalize('NFKC', tran)1522      norm_tran = tkrzw_dict.NormalizeWord(tran)1523      prob = tran_probs.get(norm_tran)1524      if prob:1525        if len(norm_tran) < 2:1526          prob *= 0.51527        score += prob1528        del tran_probs[norm_tran]1529        scored_translations.add(norm_tran)1530      bonus_translations.append((tran, score))1531    sorted_translations = []1532    for tran, score in bonus_translations:1533      norm_tran = tkrzw_dict.NormalizeWord(tran)1534      if norm_tran not in scored_translations:1535        bonus = 0.01536        for dict_tran, prob in tran_probs.items():1537          if len(dict_tran) >= 2 and norm_tran.startswith(dict_tran):1538            bonus = max(bonus, prob * 0.3)1539          elif len(norm_tran) >= 2 and dict_tran.startswith(norm_tran):1540            bonus = max(bonus, prob * 0.2)1541          elif len(dict_tran) >= 2 and norm_tran.find(dict_tran) >= 0:1542            bonus = max(bonus, prob * 0.1)1543          elif len(norm_tran) >= 2 and dict_tran.find(norm_tran) >= 0:1544            bonus = max(bonus, prob * 0.1)1545        score += bonus1546      if norm_tran in tran_labels:1547        score += (len(tran_labels[norm_tran]) - 1) * 0.0011548      tran_pos = self.tokenizer.GetJaLastPos(tran)1549      if pure_noun:1550        if tran_pos[1] == "åè©" and regex.search(r"\p{Han}", tran):1551          score *= 1.21552      if pure_verb:1553        if tran_pos[1] == "åè©":1554          if regex.search("[ãããã¤ã¬ãµããã]$", tran):1555            score *= 1.31556        elif self.tokenizer.IsJaWordSahenNoun(tran):1557          score *= 1.21558      if pure_adjective:1559        tran_pos = self.tokenizer.GetJaLastPos(tran)1560        if tran_pos[1] == "å½¢å®¹è©" or self.tokenizer.IsJaWordAdjvNoun(tran):1561          score *= 1.21562      if (pure_verb or pure_adjective or pure_adverb):1563        if len(tran) <= 1:1564          score *= 0.81565        if regex.search(r"[\p{Katakana}]", tran):1566          score *= 0.71567          if regex.fullmatch(r"[\p{Katakana}ã¼]+", tran):1568            score *= 0.71569        elif regex.fullmatch(r"[\p{Hiragana}ã¼]+", tran):1570          score *= 0.91571        elif not regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}]+", tran):1572          score *= 0.71573      else:1574        if regex.search(r"[\p{Katakana}]", tran):1575          score *= 0.81576          if regex.fullmatch(r"[\p{Katakana}ã¼]+", tran):1577            score *= 0.81578        elif regex.fullmatch(r"[\p{Hiragana}ã¼]+", tran):1579          score *= 0.951580        elif not regex.search(r"[\p{Han}\p{Hiragana}\p{Katakana}]+", tran):1581          score *= 0.81582      sorted_translations.append((tran, score))1583    sorted_translations = sorted(sorted_translations, key=lambda x: x[1], reverse=True)1584    deduped_translations = []1585    for tran, score in sorted_translations:1586      norm_tran = tkrzw_dict.NormalizeWord(tran)1587      bias = 1.01588      for prev_tran, prev_score in deduped_translations:1589        if len(prev_tran) >= 2 and norm_tran.startswith(prev_tran):1590          bias = min(bias, 0.4 if len(prev_tran) >= 2 else 0.6)1591        elif len(norm_tran) >= 2 and prev_tran.startswith(norm_tran):1592          bias = min(bias, 0.6 if len(norm_tran) >= 2 else 0.7)1593        elif len(prev_tran) >= 2 and norm_tran.find(prev_tran) >= 0:1594          bias = min(bias, 0.8 if len(prev_tran) >= 2 else 0.9)1595        elif len(norm_tran) >= 2 and prev_tran.find(norm_tran) >= 0:1596          bias = min(bias, 0.8 if len(norm_tran) >= 2 else 0.9)1597        dist = tkrzw.Utility.EditDistanceLev(norm_tran, prev_tran)1598        dist /= max(len(norm_tran), len(prev_tran))1599        if dist < 0.3:1600          bias = min(bias, dist + 0.2)1601      score *= bias1602      deduped_translations.append((tran, score))1603    deduped_translations = sorted(deduped_translations, key=lambda x: x[1], reverse=True)1604    uniq_trans = set()1605    final_translations = []1606    max_elems = int(min(max(math.log2(len(entry["item"])), 2), 8) * 8)1607    for tran, score in deduped_translations:1608      tran = regex.sub(r"^ã.*", "", tran)1609      tran = regex.sub(r"ã»", "", tran)1610      norm_tran = tkrzw_dict.NormalizeWord(tran)1611      if not norm_tran or norm_tran in uniq_trans:1612        continue1613      uniq_trans.add(norm_tran)1614      match = regex.search("(.*)(ããã|ããã|ãã)$", norm_tran)1615      if match:1616        uniq_trans.add(match.group(1) + "ãã")1617        uniq_trans.add(match.group(1) + "ããã")1618        uniq_trans.add(match.group(1) + "ããã")1619      if len(final_translations) < max_elems or score >= 0.001:1620        final_translations.append(tran)1621    sorted_aux_trans = sorted(count_aux_trans.items(), key=lambda x: -x[1])1622    for aux_tran, count in sorted_aux_trans:1623      aux_tran = regex.sub(r"^ã.*", "", aux_tran)1624      aux_tran = regex.sub(r"ã»", "", aux_tran)1625      if pure_noun:1626        aux_tran = self.MakeTranNoun(aux_tran)1627      if pure_verb:1628        aux_tran = self.MakeTranVerb(aux_tran)1629      if pure_adjective:1630        aux_tran = self.MakeTranAdjective(aux_tran)1631      if pure_adverb:1632        aux_tran = self.MakeTranAdverb(aux_tran)1633      if len(final_translations) >= max_elems: break1634      norm_tran = tkrzw_dict.NormalizeWord(aux_tran)1635      if not norm_tran or norm_tran in uniq_trans:1636        continue1637      uniq_trans.add(norm_tran)1638      final_translations.append(aux_tran)1639    if final_translations:1640      entry["translation"] = final_translations1641  def SetRelations(self, word_entry, entries, word_dicts, live_words, rev_live_words,1642                   phrase_prob_dbm, tran_prob_dbm, cooc_prob_dbm, extra_word_bases,1643                   verb_words, adj_words, adv_words):1644    word = word_entry["word"]1645    norm_word = tkrzw_dict.NormalizeWord(word)1646    scores = {}1647    def Vote(rel_word, label, weight):1648      values = scores.get(rel_word) or []1649      values.append((weight, label))1650      scores[rel_word] = values1651    synonyms = word_entry.get("_synonym")1652    if synonyms:1653      for synonym in synonyms:1654        Vote(synonym, "meta", 0.1)1655    parents = set()1656    children = set()1657    for label, entry in entries:1658      stems = entry.get("stem")1659      if stems:1660        for stem in stems:1661          parents.add(stem)1662      stem_children = entry.get("stem_child")1663      if stem_children:1664        for child in stem_children:1665          children.add(child)1666      core = entry.get("core")1667      if core:1668        parents.add(core)1669      core_children = entry.get("core_child")1670      if core_children:1671        for child in core_children:1672          children.add(child)1673      bases = entry.get("base")1674      if bases:1675        for base in bases:1676          parents.add(base)1677      base_children = entry.get("base_child")1678      if base_children:1679        for child in base_children:1680          children.add(child)1681      for rel_name, rel_weight in rel_weights.items():1682        ent_rel_words = []1683        expr = entry.get(rel_name)1684        if expr:1685          for rel_word in expr.split(","):1686            rel_word = rel_word.strip()1687            ent_rel_words.append(rel_word)1688          if ent_rel_words:1689            scored_rel_words = []1690            for i, rel_word in enumerate(ent_rel_words):1691              weight = 30 / (min(i, 30) + 30)1692              weight *= rel_weight1693              Vote(rel_word, label, weight)1694        texts = entry.get("text")1695        if texts:1696          base_weight = 1.11697          for text in texts:1698            for field in text[1].split(" [-] "):1699              if not field.startswith("[" + rel_name + "]: "): continue1700              field = regex.sub(r"^[^:]+: ", "", field)1701              field = regex.sub(r"\(.*?\) *", "", field)1702              for i, rel_word in enumerate(field.split(",")):1703                rel_word = rel_word.strip()1704                if rel_word:1705                  weight = 30 / (min(i, 30) + 30)1706                  weight *= rel_weight * base_weight1707                  Vote(rel_word, label, weight)1708            base_weight *= 0.951709    extra_word_base = extra_word_bases.get(word)1710    if extra_word_base:1711      parents.add(extra_word_base)1712    alternatives = word_entry.get("alternative")1713    if alternatives:1714      for alternative in alternatives:1715        if word not in force_parents:1716          parents.discard(alternative)1717        if alternative not in force_parents:1718          children.discard(alternative)1719    for variant in self.GetSpellVariants(word):1720      if word not in force_parents:1721        parents.discard(variant)1722      if variant not in force_parents:1723        children.discard(variant)1724    for child in children:1725      parents.discard(child)1726    if word in no_parents:1727      parents.clear()1728    force_parent = force_parents.get(word)1729    if force_parent:1730      parents.clear()1731      parents.add(force_parent)1732    parents = set([x for x in parents if force_parents.get(x) != word])1733    children = set([x for x in children if x not in no_parents])1734    translations = list(word_entry.get("translation") or [])1735    if tran_prob_dbm:1736      tsv = tran_prob_dbm.GetStr(norm_word)1737      if tsv:1738        fields = tsv.split("\t")1739        for i in range(0, len(fields), 3):1740          src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])1741          translations.append(trg)1742    translations = set([tkrzw_dict.NormalizeWord(x) for x in translations])1743    rel_words = []1744    for rel_word, votes in scores.items():1745      norm_rel_word = tkrzw_dict.NormalizeWord(rel_word)1746      label_weights = {}1747      for weight, label in votes:1748        old_weight = label_weights.get(label) or 0.01749        label_weights[label] = max(old_weight, weight)1750      total_weight = 01751      for label, weight in label_weights.items():1752        total_weight += weight1753      if tran_prob_dbm:1754        tsv = tran_prob_dbm.GetStr(norm_rel_word)1755        if tsv:1756          bonus = 0.01757          fields = tsv.split("\t")1758          for i in range(0, len(fields), 3):1759            src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])1760            norm_tran = tkrzw_dict.NormalizeWord(trg)1761            for dict_tran in translations:1762              if dict_tran == norm_tran:1763                bonus = max(bonus, 1.0)1764              elif len(dict_tran) >= 2 and norm_tran.startswith(dict_tran):1765                bonus = max(bonus, 0.3)1766              elif len(norm_tran) >= 2 and dict_tran.startswith(norm_tran):1767                bonus = max(bonus, 0.2)1768              elif len(dict_tran) >= 2 and norm_tran.find(dict_tran) >= 0:1769                bonus = max(bonus, 0.1)1770              elif len(norm_tran) >= 2 and dict_tran.find(norm_tran) >= 0:1771                bonus = max(bonus, 0.1)1772              dist = tkrzw.Utility.EditDistanceLev(dict_tran, norm_tran)1773              dist /= max(len(dict_tran), len(norm_tran))1774              if dist < 0.3:1775                bonus = max(bonus, 0.3)1776          total_weight += bonus1777      score = 1.01778      if phrase_prob_dbm:1779        prob = self.GetPhraseProb(phrase_prob_dbm, "en", rel_word)1780        prob = max(prob, 0.0000001)1781        score += math.exp(-abs(math.log(0.001) - math.log(prob))) * 0.11782      score *= total_weight1783      if tkrzw_dict.IsStopWord("en", norm_rel_word):1784        if tkrzw_dict.IsStopWord("en", norm_word):1785          score *= 0.31786        else:1787          score *= 0.11788      rel_words.append((rel_word, score))1789    rel_words = sorted(rel_words, key=lambda x: x[1], reverse=True)1790    uniq_words = set()1791    final_rel_words = []1792    for rel_word, score in rel_words:1793      if rel_word in parents or rel_word in children:1794        continue1795      if not live_words.Get(rel_word) or rel_word == word:1796        continue1797      norm_rel_word = tkrzw_dict.NormalizeWord(rel_word)1798      if not norm_rel_word: continue1799      if norm_rel_word in uniq_words: continue1800      uniq_words.add(norm_rel_word)1801      hit = False1802      for label, word_dict in word_dicts:1803        if label in self.surfeit_labels: continue1804        if norm_rel_word in word_dict:1805          hit = True1806          break1807      if not hit: continue1808      final_rel_words.append(rel_word)1809    if final_rel_words:1810      max_elems = int(min(max(math.log2(len(word_entry["item"])), 2), 6) * 6)1811      word_entry["related"] = final_rel_words[:max_elems]1812    scored_parents = []1813    for parent in parents:1814      if not live_words.Get(parent) or parent == word:1815        continue1816      prob = self.GetPhraseProb(phrase_prob_dbm, "en", parent)1817      scored_parents.append((parent, prob))1818    scored_parents = sorted(scored_parents, key=lambda x: x[1], reverse=True)1819    #print("SP", word, scored_parents)1820    1821    if scored_parents:1822      word_entry["parent"] = [x[0] for x in scored_parents]1823    scored_children = []1824    for child in children:1825      if not live_words.Get(child) or child == word or child in parents:1826        continue1827      prob = self.GetPhraseProb(phrase_prob_dbm, "en", child)1828      scored_children.append((child, prob))1829    scored_children = sorted(scored_children, key=lambda x: x[1], reverse=True)1830    if scored_children:1831      word_entry["child"] = [x[0] for x in scored_children]1832    prob = float(live_words.GetStr(word) or 0.0)1833    if prob >= 0.000001 and regex.fullmatch(r"[-\p{Latin}]+", word):1834      prefix = word + " "1835      idioms = []1836      it = live_words.MakeIterator()1837      it.Jump(prefix)1838      while True:1839        rec = it.GetStr()1840        if not rec: break1841        cmp_word, cmp_prob = rec1842        if not cmp_word.startswith(prefix): break1843        cmp_prob = float(cmp_prob)1844        cmp_score = cmp_prob / prob1845        if cmp_score >= 0.001:1846          has_particle = False1847          for cmp_token in cmp_word.split(" ")[1:]:1848            if cmp_token in particles:1849              has_particle = True1850              break1851          if has_particle:1852            cmp_score *= 3.01853          if cmp_word in verb_words or cmp_word in adj_words or cmp_word in adv_words:1854            cmp_score *= 3.01855          idioms.append((cmp_word, cmp_score))1856        it.Next()1857      it = rev_live_words.MakeIterator()1858      it.Jump(prefix)1859      while True:1860        rec = it.GetStr()1861        if not rec: break1862        cmp_word, cmp_prob = rec1863        if not cmp_word.startswith(prefix): break1864        cmp_word = " ".join(reversed(cmp_word.split(" ")))1865        cmp_prob = float(cmp_prob)1866        cmp_score = cmp_prob / prob1867        if cmp_score >= 0.001:1868          has_particle = False1869          for cmp_token in cmp_word.split(" ")[:-1]:1870            if cmp_token in particles:1871              has_particle = True1872              break1873          if has_particle:1874            cmp_score *= 3.01875          if cmp_word in verb_words or cmp_word in adj_words or cmp_word in adv_words:1876            cmp_score *= 3.01877          cmp_score * 0.91878          idioms.append((cmp_word, cmp_score))1879        it.Next()1880      idioms = sorted(idioms, key=lambda x: x[1], reverse=True)1881      uniq_idioms = set()1882      final_idioms = []1883      for idiom, prob in idioms:1884        if idiom in uniq_idioms: continue1885        uniq_idioms.add(idiom)1886        final_idioms.append(idiom)1887      if final_idioms:1888        max_elems = int(min(max(math.log2(len(word_entry["item"])), 2), 6) * 4)1889        word_entry["idiom"] = final_idioms[:max_elems]1890  def SetCoocurrences(self, word_entry, entries, word_dicts, phrase_prob_dbm, cooc_prob_dbm):1891    word = word_entry["word"]1892    norm_word = tkrzw_dict.NormalizeWord(word)1893    tokens = self.tokenizer.Tokenize("en", word, True, True)1894    cooc_words = collections.defaultdict(float)1895    max_word_weight = 0.01896    for token in tokens:1897      phrase_prob = self.GetPhraseProb(phrase_prob_dbm, "en", token)1898      word_idf = math.log(phrase_prob) * -11899      word_weight = word_idf ** 21900      max_word_weight = max(max_word_weight, word_weight)1901      tsv = cooc_prob_dbm.GetStr(token)1902      if tsv:1903        for field in tsv.split("\t")[:32]:1904          cooc_word, cooc_prob = field.split(" ", 1)1905          cooc_tokens = self.tokenizer.Tokenize("en", cooc_word, True, True)1906          for cooc_token in cooc_tokens:1907            if cooc_token and cooc_word not in tokens:1908              cooc_words[cooc_token] += float(cooc_prob) * word_weight1909    def_token_labels = collections.defaultdict(set)1910    for item in word_entry["item"]:1911      label = item["label"]1912      if label not in self.full_def_labels: continue1913      text = item["text"]1914      text = regex.sub(r" \[-.*", "", text).strip()1915      if regex.search(r"^\[-.*", text): continue1916      text = regex.sub(r"\(.*?\)", "", text)1917      text = regex.sub(r"\[.*?\]", "", text)1918      if not text: continue1919      def_tokens = self.tokenizer.Tokenize("en", text, True, True)1920      for def_token in def_tokens:1921        if not regex.fullmatch(r"[\p{Latin}]{2,}", def_token): continue1922        if def_token in particles or def_token in misc_stop_words: continue1923        if def_token in tokens: continue1924        def_token_labels[def_token].add(label)1925    for def_token, labels in def_token_labels.items():1926      cooc_words[def_token] += 0.01 * len(labels) * max_word_weight1927    is_wiki_word = "wikipedia" in cooc_words or "encyclopedia" in cooc_words1928    merged_cooc_words = sorted(cooc_words.items(), key=lambda x: x[1], reverse=True)1929    weighed_cooc_words = []1930    for cooc_word, cooc_score in merged_cooc_words:1931      cooc_prob = self.GetPhraseProb(phrase_prob_dbm, "en", cooc_word)1932      cooc_idf = math.log(cooc_prob) * -11933      cooc_score *= cooc_idf ** 21934      if tkrzw_dict.IsStopWord("en", cooc_word):1935        if tkrzw_dict.IsStopWord("en", norm_word):1936          cooc_score *= 0.31937        else:1938          cooc_score *= 0.11939      elif cooc_word in particles or cooc_word in misc_stop_words:1940        cooc_score *= 0.51941      elif is_wiki_word and cooc_word in wiki_stop_words:1942        cooc_score *= 0.21943      weighed_cooc_words.append((cooc_word, cooc_score))1944    sorted_cooc_words = sorted(weighed_cooc_words, key=lambda x: x[1], reverse=True)1945    final_cooc_words = []1946    for cooc_word, cooc_score in sorted_cooc_words:1947      if len(final_cooc_words) >= 16: break1948      hit = False1949      for label, word_dict in word_dicts:1950        if label in self.surfeit_labels: continue1951        if cooc_word in word_dict:1952          hit = True1953          break1954      if not hit: continue1955      final_cooc_words.append(cooc_word)1956    if final_cooc_words:1957      word_entry["cooccurrence"] = final_cooc_words1958  def CompensateInflections(self, entry, merged_dict, verb_words):1959    word = entry["word"]1960    root_verb = None1961    ing_value = entry.get("verb_present_participle")1962    if ing_value and ing_value.endswith("<ing"):1963      root_verb = ing_value[:-4]1964    for infl_name in inflection_names:1965      value = entry.get(infl_name)1966      if value and not regex.fullmatch(r"[-\p{Latin}0-9', ]+", value):1967        del entry[infl_name]1968    poses = set()1969    for item in entry["item"]:1970      poses.add(item["pos"])1971    if "verb" in poses and word.find(" ") >= 0 and not regex.search(r"[A-Z]", word):1972      tokens = self.tokenizer.Tokenize("en", word, False, False)1973      if len(tokens) > 1:1974        if not root_verb:1975          for token in tokens:1976            if token not in particles and token not in misc_stop_words and token in verb_words:1977              root_verb = token1978              break1979        if root_verb:1980          root_entry = merged_dict.get(root_verb)1981          if root_entry:1982            for infl_name in inflection_names:1983              if not infl_name.startswith("verb_") or entry.get(infl_name):1984                continue1985              root_infls = root_entry[0].get(infl_name)1986              if not root_infls:1987                continue1988              phrase_infls = []1989              for root_infl in regex.split(r"[,|]", root_infls):1990                root_infl = root_infl.strip()1991                if not root_infl: continue1992                root_infl_tokens = []1993                for token in tokens:1994                  if root_infl and token == root_verb:1995                    root_infl_tokens.append(root_infl)1996                    root_infl = None1997                  else:1998                    root_infl_tokens.append(token)1999                phrase_infls.append(" ".join(root_infl_tokens))2000              if phrase_infls:2001                entry[infl_name] = ", ".join(phrase_infls)2002  def CompensateAlternatives(self, word_entry, merged_dict):2003    word = word_entry["word"]2004    alternatives = word_entry.get("alternative") or []2005    variants = self.GetSpellVariants(word)2006    wn_count = 02007    for item in word_entry["item"]:2008      if item["label"] != "wn": continue2009      wn_count += 12010      for section in item["text"].split("[-]"):2011        section = section.strip()2012        match = regex.search(r"\[synonym\]: (.*)", section)2013        if match:2014          for synonym in match.group(1).split(","):2015            synonym = synonym.strip()2016            dist = tkrzw.Utility.EditDistanceLev(word, synonym)2017            similar = False2018            if dist == 1 and word[:3] != synonym[:3]:2019              similar = True2020            elif dist == 2 and word[:5] == synonym[:5] and word[-2:] == synonym[-2:]:2021              similar = True2022            if similar and synonym not in variants:2023              variants.add(synonym)2024    for variant in variants:2025      if word[:2] != variant[:2]: continue2026      if variant in alternatives: continue2027      variant_entries = merged_dict.get(variant)2028      if not variant_entries: continue2029      for variant_entry in variant_entries:2030        if variant_entry["word"] != variant: continue2031        var_wn_count = 02032        var_wn_counts = collections.defaultdict(int)2033        for item in variant_entry["item"]:2034          if item["label"] != "wn": continue2035          var_wn_count += 12036          for section in item["text"].split("[-]"):2037            section = section.strip()2038            match = regex.search(r"\[synonym\]: (.*)", section)2039            if match:2040              for synonym in match.group(1).split(","):2041                synonym = synonym.strip()2042                if synonym:2043                  var_wn_counts[synonym] += 12044        hits = var_wn_counts[word]2045        if (wn_count > 0 and var_wn_count == wn_count and hits == wn_count) or hits >= 4:2046          alternatives.append(variant)2047    if alternatives:2048      word_entry["alternative"] = alternatives2049  def GetSpellVariants(self, word):2050    variants = set()2051    suffix_pairs = [("se", "ze"), ("sing", "zing"), ("sed", "zed"), ("ser", "zer"),2052                    ("sation", "zation"), ("ce", "se"),2053                    ("our", "or"), ("og", "ogue"), ("re", "er"), ("l", "ll")]2054    for suffix1, suffix2 in suffix_pairs:2055      if word.endswith(suffix1):2056        variant = word[:-len(suffix1)] + suffix22057        variants.add(variant)2058      if word.endswith(suffix2):2059        variant = word[:-len(suffix2)] + suffix12060        variants.add(variant)2061    return variants2062  def GetEntryTranslations(self, merged_dict, word, is_capital, best_pos):2063    key = tkrzw_dict.NormalizeWord(word)2064    entry = merged_dict.get(key)2065    if not entry: return None2066    scored_trans = []2067    word_score = 1.02068    for word_entry in entry:2069      cmp_word = word_entry["word"]2070      if bool(regex.search(r"\p{Lu}", cmp_word)) != is_capital:2071        continue2072      item_score = 1.02073      for item in word_entry["item"]:2074        pos = item["pos"]2075        text = item["text"]2076        trans = self.ExtractTextLabelTrans(text)2077        if trans:2078          score = word_score * item_score2079          if pos == best_pos:2080            score *= 2.02081          for tran in trans:2082            scored_trans.append((tran, score))2083            score *= 0.92084          item_score *= 0.92085      trans = word_entry.get("translation")2086      if trans:2087        score = word_score * item_score2088        for tran in trans:2089          scored_trans.append((tran, score))2090          score *= 0.92091      word_score *= 0.52092    scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True)2093    return [x[0] for x in scored_trans]2094  def PropagateTranslations(self, entry, merged_dict, tran_prob_dbm, aux_last_trans):2095    old_trans = entry.get("translation") or []2096    if len(old_trans) >= 8: return2097    word = entry["word"]2098    is_capital = bool(regex.search(r"\p{Lu}", word))2099    if len(word) <= 2: return2100    uniq_labels = set()2101    top_exprs = []2102    poses = set()2103    synonyms = []2104    for item in entry["item"]:2105      label = item["label"]2106      pos = item["pos"]2107      poses.add(pos)2108      if label in self.gross_labels or label in self.supplement_labels: continue2109      is_first = label not in uniq_labels2110      uniq_labels.add(label)2111      text = item["text"]2112      for field in text.split(" [-] "):2113        if not field.startswith("[synonym]: "): continue2114        field = regex.sub(r"^[^:]+: ", "", field)2115        field = regex.sub(r"\(.*?\) *", "", field)2116        for synonym in field.split(","):2117          synonym = synonym.strip()2118          if synonym:2119            synonyms.append((synonym, pos))2120      text = regex.sub(r" \[-+\] .*", "", text)2121      text = regex.sub(r"\(.*?\)", "", text)2122      text = regex.sub(r"\.$", "", text)2123      text = regex.sub(r"([-\p{Latin}\d]{5,})\.", r"\1;", text)2124      for expr in text.split(";"):2125        expr = expr.strip()2126        if pos == "verb":2127          expr = regex.sub(r"^to +([\p{Latin}])", r"\1", expr, flags=regex.IGNORECASE)2128        elif pos == "noun":2129          expr = regex.sub(r"^(a|an|the) +([\p{Latin}])", r"\2", expr, flags=regex.IGNORECASE)2130        if expr:2131          top_exprs.append((expr, pos, is_first))2132    top_words = []2133    for expr, pos, is_first in top_exprs:2134      manner_match = regex.search(r"^in +([-\p{Latin}].*?) +(manner|fashion|way)$",2135                                  expr, regex.IGNORECASE)2136      preps = ["of", "in", "at", "from", "by", "part of", "out of", "inside",2137               "relating to", "related to", "associated with",2138               "characterized by", "pertaining to", "derived from"]2139      prep_expr = None2140      for prep in preps:2141        if len(expr) > len(prep):2142          if expr[:len(prep)].lower() == prep:2143            expr_lead = expr[len(prep):]2144            joint_match = regex.match(r"^,?( +or)? +", expr_lead)2145            if joint_match:2146              expr = expr_lead[joint_match.end():]2147              prep_expr = expr2148      if manner_match:2149        expr = manner_match.group(1).strip()2150        expr = regex.sub(r"^(a|an|the) +", "", expr, flags=regex.IGNORECASE)2151        if expr:2152          top_words.append((expr, "adjective", "adverb", is_first))2153      elif prep_expr:2154        expr = regex.sub(r"^(a|an|the) +([\p{Latin}])", r"\2", prep_expr, flags=regex.IGNORECASE)2155        if expr:2156          new_pos = "adverb" if pos == "adverb" else "adjective"2157          top_words.append((expr, "noun", new_pos, is_first))2158      else:2159        expr = expr.strip()2160        if expr:2161          top_words.append((expr, pos, "", is_first))2162    etym_prefix = entry.get("etymology_prefix")2163    etym_core = entry.get("etymology_core")2164    etym_suffix = entry.get("etymology_suffix")2165    if ("noun" in poses and not etym_prefix and etym_core and2166        etym_suffix in ("ness", "cy", "ity")):2167      top_words.append((etym_core, "adjective", "noun", True))2168    if ("noun" in poses and not etym_prefix and etym_core and2169        etym_suffix in ("ment", "tion", "sion")):2170      top_words.append((etym_core, "verb", "noun", True))2171    if ("verb" in poses and not etym_prefix and etym_core and2172        etym_suffix in ("ise", "ize")):2173      top_words.append((etym_core, "adjective", "verb", True))2174    if ("adjective" in poses and not etym_prefix and etym_core2175        and etym_suffix in ("ic", "ical", "ish", "ly")):2176      top_words.append((etym_core, "noun", "adjective", True))2177    if ("adverb" in poses and not etym_prefix and etym_core and2178        etym_suffix == "ly"):2179      top_words.append((etym_core, "adjective", "adverb", True))2180    parents = entry.get("parent")2181    if parents:2182      for parent in parents:2183        if len(parent) < 5: continue2184        if ("noun" in poses and2185            (word.endswith("ness") or word.endswith("cy") or word.endswith("ity"))):2186          top_words.append((parent, "adjective", "noun", True))2187        if ("noun" in poses and2188            (word.endswith("ment") or word.endswith("tion") or word.endswith("sion"))):2189          top_words.append((parent, "verb", "noun", True))2190        if ("verb" in poses and2191            (word.endswith("ise") or word.endswith("tze"))):2192          top_words.append((parent, "adjective", "verb", True))2193        if ("adjective" in poses and2194            (word.endswith("ic") or word.endswith("ical") or word.endswith("ish"))):2195          top_words.append((parent, "noun", "adjective", True))2196        if ("adverb" in poses and2197            word.endswith("ly")):2198          top_words.append((parent, "adjective", "adverb", True))2199    ent_synonyms = entry.get("_synonym")2200    if ent_synonyms:2201      for synonym in ent_synonyms:2202        norm_synonym = tkrzw_dict.NormalizeWord(synonym)2203        syn_entries = merged_dict.get(norm_synonym)2204        if syn_entries:2205          syn_pos = ""2206          for syn_entry in syn_entries:2207            if syn_entry["word"] != synonym: continue2208            for syn_item in syn_entry["item"]:2209              if syn_item["pos"] in poses:2210                syn_pos = syn_item["pos"]2211                break2212          if syn_pos:2213            synonyms.append((synonym, syn_pos))2214    for synonym, pos in synonyms:2215      top_words.append((synonym, pos, "", False))2216    trans = []2217    tran_sources = set()2218    for expr, pos, conversion, trustable in top_words:2219      expr = regex.sub(r"^([-\p{Latin}]+), ([-\p{Latin}]+),? +or +([-\p{Latin}]+)$",2220                       r"\1; \2; \3", expr)2221      expr = regex.sub(r"^([-\p{Latin}]+) +or +([-\p{Latin}]+)$", r"\1; \2", expr)2222      expr = regex.sub(r"^([-\p{Latin}]+), +([-\p{Latin}]+)$", r"\1; \2", expr)2223      for rel_word in expr.split(";"):2224        rel_word = rel_word.strip()2225        if len(rel_word) <= 2: continue2226        word_trans = self.GetEntryTranslations(merged_dict, rel_word, is_capital, pos)2227        if not word_trans: continue2228        new_pos = conversion or pos2229        if new_pos == "noun":2230          word_trans = [self.MakeTranNoun(x) for x in word_trans]2231        elif new_pos == "verb":2232          word_trans = [self.MakeTranVerb(x) for x in word_trans]2233        elif new_pos == "adjective":2234          word_trans = [self.MakeTranAdjective(x) for x in word_trans]2235        elif new_pos == "adverb":2236          word_trans = [self.MakeTranAdverb(x) for x in word_trans]2237        for rank, word_tran in enumerate(word_trans):2238          tran_source = (word_tran, rel_word)2239          if tran_source in tran_sources: continue2240          tran_sources.add(tran_source)2241          trans.append((word_tran, trustable, rel_word, rank))2242    prob_trans = {}2243    key = tkrzw_dict.NormalizeWord(word)2244    tsv = tran_prob_dbm.GetStr(key)2245    if tsv:2246      fields = tsv.split("\t")2247      for i in range(0, len(fields), 3):2248        src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])2249        if regex.search("[ã£ã]$", trg) and self.tokenizer.GetJaLastPos(trg)[1] == "åè©":2250          continue2251        norm_trg = tkrzw_dict.NormalizeWord(trg)2252        prob = float(prob)2253        if src != word:2254          prob *= 0.12255        prob_trans[norm_trg] = max(prob_trans.get(norm_trg) or 0.0, prob)2256    scored_trans = []2257    tran_counts = {}2258    for tran, trustable, rel_word, rank in trans:2259      tran_counts[tran] = (tran_counts.get(tran) or 0) + 12260    for tran, trustable, rel_word, rank in trans:2261      norm_tran = tkrzw_dict.NormalizeWord(tran)2262      max_weight = 02263      prob_hit = False2264      for prob_tran, prob in prob_trans.items():2265        prob **= 0.252266        dist = tkrzw.Utility.EditDistanceLev(norm_tran, prob_tran)2267        dist /= max(len(norm_tran), len(prob_tran))2268        weight = prob ** 0.5 + 2.0 - dist2269        if norm_tran == prob_tran:2270          weight *= 102271          prob_hit = True2272        elif len(prob_tran) >= 2 and norm_tran.startswith(prob_tran):2273          weight *= 52274          prob_hit = True2275        elif len(norm_tran) >= 2 and prob_tran.startswith(norm_tran):2276          weight *= 52277          prob_hit = True2278        elif len(prob_tran) >= 2 and norm_tran.find(prob_tran) >= 0:2279          weight *= 32280          prob_hit = True2281        elif len(norm_tran) >= 2 and prob_tran.find(norm_tran) >= 0:2282          weight *= 32283          prob_hit = True2284        elif dist < 0.3:2285          weight *= 22286          prob_hit = True2287        max_weight = max(max_weight, weight)2288      if not trustable and not prob_hit:2289        continue2290      tran_count = tran_counts[tran]2291      count_score = 1 + (tran_count * 0.2)2292      rank_score = 0.95 ** rank2293      score = max_weight * count_score * rank_score2294      scored_trans.append((tran, score, prob_hit))2295    scored_trans = sorted(scored_trans, key=lambda x: x[1], reverse=True)2296    rec_aux_trans = aux_last_trans.get(word)2297    if rec_aux_trans:2298      scored_aux_trans = []2299      for aux_tran in rec_aux_trans:2300        norm_trg = tkrzw_dict.NormalizeWord(aux_tran)2301        prob = prob_trans.get(norm_trg) or 0.02302        prob += 0.01 / (len(aux_tran) + 1)2303        scored_aux_trans.append((aux_tran, prob))2304      scored_aux_trans = sorted(scored_aux_trans, key=lambda x: x[1], reverse=True)2305      for aux_tran, score in scored_aux_trans:2306        scored_trans.append((aux_tran, 0, False))2307    final_trans = []2308    uniq_trans = set()2309    for tran in old_trans:2310      norm_tran = tkrzw_dict.NormalizeWord(tran)2311      uniq_trans.add(norm_tran)2312      final_trans.append(tran)2313    num_rank = 02314    for tran, score, prob_hit in scored_trans:2315      if len(final_trans) >= 8: break2316      norm_tran = tkrzw_dict.NormalizeWord(tran)2317      if norm_tran in uniq_trans: continue2318      num_rank += 12319      if not prob_hit:2320        if num_rank > 3: continue2321        if num_rank > 2 and len(final_trans) >= 3: continue2322      uniq_trans.add(norm_tran)2323      final_trans.append(tran)2324    if final_trans:2325      entry["translation"] = final_trans2326  def MakeTranNoun(self, tran):2327    pos = self.tokenizer.GetJaLastPos(tran)2328    stem = self.tokenizer.CutJaWordNounParticle(tran)2329    if tran.endswith("ãã"):2330      tran = tran[:-2]2331    elif tran.endswith("ããã"):2332      tran = tran[:-3]2333    elif tran.endswith("ããã"):2334      tran = tran[:-3]2335    elif tran.endswith("ãããª"):2336      tran = tran[:-3]2337    elif self.tokenizer.IsJaWordAdjvNoun(stem):2338      tran = stem2339    elif tran.endswith("ã") and pos[1] == "å½¢å®¹è©":2340      tran = tran[:-1] + "ã"2341    elif pos[1] in "åè©" and regex.search(r"[ãããã¤ã¬ãµããã]$", tran):2342      tran = tran + "ãã¨"2343    elif pos[1] in "å½¢å®¹è©" and regex.search(r"[ãã]$", tran):2344      tran = tran + "ãã¨"2345    elif pos[0] in ("ã", "ãª") and pos[1] == "å©åè©":2346      tran = tran + "ãã¨"2347    return tran2348  def MakeTranVerb(self, tran):2349    pos = self.tokenizer.GetJaLastPos(tran)2350    if self.tokenizer.IsJaWordSahenNoun(tran):2351      tran = tran + "ãã"2352    elif tran.endswith("ã") and pos[1] == "å½¢å®¹è©":2353      tran = tran[:-1] + "ããã"2354    elif pos[1] == "åè©" and pos[2] == "å½¢å®¹åè©èªå¹¹":2355      tran = tran + "ã«ãã"2356    return tran2357  def MakeTranAdjective(self, tran):2358    pos = self.tokenizer.GetJaLastPos(tran)2359    stem = self.tokenizer.CutJaWordNounParticle(tran)2360    is_adjv = False2361    if tran.endswith("ãã"):2362      tran = tran[:-2]2363    elif tran.endswith("ããã"):2364      tran = tran[:-3]2365    elif tran.endswith("ããã"):2366      tran = tran[:-3]2367    elif tran.endswith("ãããª"):2368      tran = tran[:-3]2369    elif self.tokenizer.IsJaWordAdjvNoun(stem):2370      tran = stem2371      is_adjv = True2372    pos = self.tokenizer.GetJaLastPos(tran)2373    if self.tokenizer.IsJaWordAdjvNounOnly(tran):2374      tran += "ãª"2375    elif pos[1] == "åè©":2376      if tran.endswith("ç"):2377        tran += "ãª"2378      else:2379        tran += "ã®"2380    return tran2381  def MakeTranAdverb(self, tran):2382    pos = self.tokenizer.GetJaLastPos(tran)2383    stem = self.tokenizer.CutJaWordNounParticle(tran)2384    if tran.endswith("ãã"):2385      tran = tran[:-2] + "ãã¦"2386    elif tran.endswith("ããã"):2387      tran = tran[:-3] + "ããã¦"2388    elif tran.endswith("ããã"):2389      tran = tran[:-3] + "ããã¦"2390    elif tran.endswith("ãããª"):2391      tran = tran[:-3] + "ããã«"2392    elif tran.endswith("ããã"):2393      tran = tran[:-3] + "ããã"2394    elif tran.endswith("ã¨ãã"):2395      tran = tran[:-3] + "ã¨ãã¦"2396    elif tran.endswith("ã") and pos[1] == "å½¢å®¹è©":2397      tran = tran[:-1] + "ã"2398    elif tran.endswith("çãª"):2399      tran = tran[:-1] + "ã«"2400    elif self.tokenizer.IsJaWordSahenNoun(stem):2401      tran = stem + "ãã¦"2402    elif self.tokenizer.IsJaWordAdjvNoun(stem):2403      tran = stem + "ã«"2404    elif stem != tran or pos[1] == "åè©":2405      tran = stem + "ã§"2406    elif pos[0] == "ã" and pos[1] == "å©åè©":2407      tran = tran[:-1] + "ã¦"2408    elif pos[1] == "åè©":2409      tran = stem + "ããã«"2410    return tran2411  def SetPhraseTranslations(self, entry, merged_dict, aux_trans, aux_last_trans,2412                            tran_prob_dbm, phrase_prob_dbm, noun_words, verb_words,2413                            live_words, rev_live_words):2414    if not tran_prob_dbm or not phrase_prob_dbm:2415      return2416    word = entry["word"]2417    if not regex.fullmatch(r"[-\p{Latin}]+", word):2418      return2419    if len(word) < 2 or word in ("an", "the"):2420      return2421    is_noun = word in noun_words2422    is_verb = word in verb_words2423    word_prob = float(phrase_prob_dbm.GetStr(word) or 0.0)2424    if word_prob < 0.00001:2425      return2426    word_mod_prob = min(word_prob, 0.001)2427    norm_word = " ".join(self.tokenizer.Tokenize("en", word, True, True))2428    if word != norm_word:2429      return2430    phrases = []2431    for particle in particles:2432      phrase = word + " " + particle2433      phrase_prob = float(phrase_prob_dbm.GetStr(phrase) or 0.0)2434      ratio = phrase_prob / word_mod_prob2435      if is_verb and ratio >= 0.005:2436        for pron in ("me", "us", "you", "him", "her", "it", "them"):2437          pron_phrase = word + " " + pron + " " + particle2438          pron_phrase_prob = float(phrase_prob_dbm.GetStr(pron_phrase) or 0.0)2439          if pron_phrase_prob > 0.0:2440            phrase_prob += pron_phrase_prob * 2.02441            ratio = phrase_prob / word_mod_prob2442      phrases.append((phrase, True, ratio, ratio, phrase_prob))2443      if ratio >= 0.005:2444        for sub_particle in particles:2445          sub_phrase = phrase + " " + sub_particle2446          sub_phrase_prob = float(phrase_prob_dbm.GetStr(sub_phrase) or 0.0)2447          sub_ratio = max(sub_phrase_prob / phrase_prob, 0.01)2448          phrases.append((sub_phrase, True, max(sub_ratio, ratio),2449                          ratio * (sub_ratio ** 0.005), sub_phrase_prob))2450    verb_prob = 0.02451    if is_verb:2452      for auxverb in ("not", "will", "shall", "can", "may", "must"):2453        auxverb_prob = float(phrase_prob_dbm.GetStr(auxverb + " " + word) or 0.0)2454        verb_prob += auxverb_prob2455      verb_prob *= 202456    for particle in particles:2457      phrase = particle + " " + word2458      phrase_prob = float(phrase_prob_dbm.GetStr(phrase) or 0.0)2459      if particle == "to":2460        phrase_prob -= verb_prob2461      ratio = phrase_prob / word_mod_prob2462      phrases.append((phrase, False, ratio, ratio, phrase_prob))2463      if is_noun:2464        for art in ("the", "a", "an"):2465          sub_phrase = particle + " " + art + " " + word2466          sub_phrase_prob = float(phrase_prob_dbm.GetStr(sub_phrase) or 0.0)2467          sub_ratio = sub_phrase_prob / word_mod_prob2468          phrases.append((sub_phrase, False, sub_ratio, sub_ratio, sub_phrase_prob))2469    it = live_words.MakeIterator()2470    it.Jump(word + " ")2471    while True:2472      rec = it.GetStr()2473      if not rec: break2474      phrase, phrase_prob = rec2475      if not phrase.startswith(word + " "): break2476      phrase_prob = float(phrase_prob)2477      ratio = phrase_prob / word_prob2478      if ratio >= 0.05:2479        phrases.append((phrase, True, ratio, ratio, phrase_prob))2480      it.Next()2481    it = rev_live_words.MakeIterator()2482    it.Jump(word + " ")2483    while True:2484      rec = it.GetStr()2485      if not rec: break2486      phrase, phrase_prob = rec2487      if not phrase.startswith(word + " "): break2488      phrase_prob = float(phrase_prob)2489      ratio = phrase_prob / word_prob2490      if ratio >= 0.05:2491        phrase = " ".join(reversed(phrase.split(" ")))2492        phrases.append((phrase, True, ratio, ratio, phrase_prob))2493      it.Next()2494    if not phrases:2495      return2496    orig_trans = {}2497    tsv = tran_prob_dbm.GetStr(word)2498    if tsv:2499      fields = tsv.split("\t")2500      for i in range(0, len(fields), 3):2501        src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])2502        trg = regex.sub(r"[ï½ã]", "", trg)2503        trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2504        if src == word and prob >= 0.06:2505          orig_trans[trg] = prob2506    aux_orig_trans = (aux_trans.get(word) or []) + (aux_last_trans.get(word) or [])2507    if aux_orig_trans:2508      for trg in set(aux_orig_trans):2509        trg = regex.sub(r"[ï½ã]", "", trg)2510        trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2511        orig_trans[trg] = float(orig_trans.get(trg) or 0) + 0.12512    ent_orig_trans = entry.get("translation")2513    if ent_orig_trans:2514      base_score = 0.12515      for ent_orig_tran in ent_orig_trans:2516        orig_trans[ent_orig_tran] = float(orig_trans.get(ent_orig_tran) or 0) + base_score2517        base_score *= 0.92518    final_phrases = []2519    uniq_phrases = set()2520    for phrase, is_suffix, mod_prob, phrase_score, raw_prob in phrases:2521      if phrase in uniq_phrases: continue2522      uniq_phrases.add(phrase)2523      phrase_trans = {}2524      phrase_prefixes = {}2525      pos_match = is_verb if is_suffix else is_noun2526      if mod_prob >= 0.02:2527        if pos_match:2528          tsv = tran_prob_dbm.GetStr(phrase)2529          if tsv:2530            fields = tsv.split("\t")2531            for i in range(0, len(fields), 3):2532              src, trg, prob = fields[i], fields[i + 1], float(fields[i + 2])2533              if src != phrase:2534                continue2535              if regex.search("[ã£ã]$", trg) and self.tokenizer.GetJaLastPos(trg)[1] == "åè©":2536                continue2537              if (is_verb and regex.search("[ãããã¡ã«ã²ã¿ã]$", trg) and2538                  self.tokenizer.GetJaLastPos(trg)[1] == "åè©"):2539                continue2540              trg = regex.sub(r"[ï½ã]", "", trg)2541              trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2542              if not trg or regex.fullmatch(r"[\p{Katakana}ã¼]+", trg):2543                continue2544              pos = self.tokenizer.GetJaLastPos(trg)2545              if (is_noun and is_suffix and pos[1] == "åè©" and2546                  not self.tokenizer.IsJaWordSahenNoun(trg)):2547                continue2548              if is_noun and is_suffix and trg in ("ãã", "ãã", "ã§ã", "ã¾ã"):2549                continue2550              orig_prob = orig_trans.get(trg) or 0.02551              if is_verb:2552                if self.tokenizer.IsJaWordSahenNoun(trg):2553                  orig_prob = max(orig_prob, orig_trans.get(trg + "ãã") or 0.0)2554                for ext_suffix in ("ãã", "ãã", "ãã¦", "ããã", "ããã", "ããã¦"):2555                  orig_prob = max(orig_prob, orig_trans.get(trg[:len(ext_suffix)]) or 0.0)2556              if (is_suffix and is_verb and not trg_prefix and trg_suffix and2557                  (pos[1] == "åè©" or self.tokenizer.IsJaWordSahenNoun(trg))):2558                trg_prefix = trg_suffix2559                trg_suffix = ""2560              elif is_suffix and is_noun and not trg_prefix:2561                if trg_suffix == "ã®ãã":2562                  trg_suffix = "ããã®"2563                trg_prefix = trg_suffix2564                trg_suffix = ""2565              elif not trg_suffix and trg_prefix in ("ããã®", "ã®ãã"):2566                if trg.endswith("ãã"):2567                  trg += "ããã®"2568                else:2569                  trg += "ã®ãã"2570                trg_prefix = ""2571              elif trg_suffix:2572                trg += trg_suffix2573              sum_prob = orig_prob + prob2574              if sum_prob >= 0.1:2575                if is_verb and pos[1] == "åè©":2576                  sum_prob += 0.12577                phrase_trans[trg] = float(phrase_trans.get(trg) or 0.0) + sum_prob2578                if trg_prefix and not trg_suffix:2579                  part_key = trg + ":" + trg_prefix2580                  phrase_prefixes[part_key] = float(phrase_trans.get(part_key) or 0.0) + sum_prob2581        for aux_phrase_trans in (aux_trans.get(phrase), aux_last_trans.get(phrase)):2582          if aux_phrase_trans:2583            for trg in aux_phrase_trans:2584              trg = regex.sub(r"[ï½ã]", "", trg)2585              trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2586              if is_noun and is_suffix and trg in ("ãã", "ãã", "ã§ã", "ã¾ã"):2587                continue2588              phrase_trans[trg] = float(phrase_trans.get(trg) or 0.0) + 0.12589      if mod_prob >= 0.001:2590        phrase_entries = merged_dict.get(phrase)2591        if phrase_entries:2592          for phrase_entry in phrase_entries:2593            if phrase_entry["word"] != phrase: continue2594            ent_phrase_trans = phrase_entry.get("translation")2595            if ent_phrase_trans:2596              base_score = 0.152597              for trg in ent_phrase_trans:2598                trg, trg_prefix, trg_suffix = self.tokenizer.StripJaParticles(trg)2599                phrase_trans[trg] = float(phrase_trans.get(trg) or 0.0) + base_score2600                if trg_prefix and not trg_suffix:2601                  part_key = trg + ":" + trg_prefix2602                  phrase_prefixes[part_key] = float(phrase_trans.get(part_key) or 0.0) + base_score2603                base_score *= 0.92604      if not phrase_trans:2605        continue2606      for tran in list(phrase_trans.keys()):2607        if not regex.search(r"[\p{Han}\p{Katakana}]", tran):2608          continue2609        for cmp_tran, cmp_score in list(phrase_trans.items()):2610          if cmp_tran not in phrase_trans: continue2611          if cmp_tran.startswith(tran):2612            suffix = cmp_tran[len(tran):]2613            if suffix in ("ãã", "ããã", "ããã", "ã«", "ãª", "ã®"):2614              phrase_trans[cmp_tran] = cmp_score + float(phrase_trans.get(tran) or 0)2615              if tran in phrase_trans:2616                del phrase_trans[tran]2617      mod_trans = {}2618      for tran, score in phrase_trans.items():2619        prefix_check = tran + ":"2620        best_prefix = ""2621        best_prefix_score = 0.02622        for prefix, score in phrase_prefixes.items():2623          if not prefix.startswith(prefix_check): continue2624          if score >= best_prefix_score:2625            best_prefix = prefix[len(prefix_check):]2626            best_prefix_score = score2627        if regex.search(r"^[\p{Katakana}ã¼]", tran):2628          score *= 0.52629        pos = self.tokenizer.GetJaLastPos(tran)2630        if is_suffix and is_verb:2631          if pos[1] == "åè©" and regex.search("[ãããã¤ã¬ãµããã]$", tran):2632            score *= 1.52633          if pos[1] == "åè©" and not self.tokenizer.IsJaWordSahenNoun(tran):2634            score *= 0.52635        if not is_suffix and pos[1] == "åè©" and not best_prefix:2636          if self.tokenizer.IsJaWordSahenNoun(tran) or self.tokenizer.IsJaWordAdjvNoun(tran):2637            score *= 0.72638          else:2639            score *= 0.52640        if len(tran) <= 1:2641          score *= 0.52642        if is_verb:2643          orig_tran = tran2644          pos = self.tokenizer.GetJaLastPos(tran)2645          if self.tokenizer.IsJaWordSahenNoun(tran) and best_prefix != "ã®":2646            tran = tran + "ãã"2647        if best_prefix and best_prefix not in ("ã", "ã", "ã¯"):2648          tran = "({}){}".format(best_prefix, tran)2649        mod_trans[tran] = float(mod_trans.get(tran) or 0.0) + score2650      scored_trans = sorted(mod_trans.items(), key=lambda x: x[1], reverse=True)[:4]2651      if scored_trans:2652        final_phrases.append((phrase, phrase_score, raw_prob, [x[0] for x in scored_trans]))2653    if final_phrases:2654      final_phrases = sorted(final_phrases, key=lambda x: x[1], reverse=True)2655      map_phrases = []2656      for phrase, score, raw_prob, trans in final_phrases:2657        prob_expr = "{:.6f}".format(raw_prob / word_prob).replace("0.", ".")2658        map_phrase = {"w": phrase, "p": prob_expr, "x": trans}2659        if phrase in merged_dict:2660          map_phrase["i"] = "1"2661        map_phrases.append(map_phrase)2662      entry["phrase"] = map_phrases2663  def FilterParents(self, word_entry, merged_dict):2664    word = word_entry["word"]2665    parents = word_entry.get("parent")2666    if not parents or len(parents) < 2: return2667    ancestors = parents2668    while True:2669      grand_ancestors = []2670      for ancestor in ancestors:2671        ancestor_entries = merged_dict.get(ancestor)2672        if ancestor_entries:2673          for ancestor_entry in ancestor_entries:2674            if ancestor_entry["word"] != ancestor: continue2675            for grand_ancestor in ancestor_entry.get("parent") or []:2676              if grand_ancestor in ancestors and grand_ancestor not in grand_ancestors:2677                grand_ancestors.append(grand_ancestor)2678      if not grand_ancestors or len(grand_ancestors) == len(ancestors):2679        break2680      ancestors = grand_ancestors2681    scored_parents = []2682    for parent in parents:2683      parent_prob = 02684      parent_entries = merged_dict.get(parent)2685      if parent_entries:2686        for parent_entry in parent_entries:2687          if parent_entry["word"] != parent: continue2688          parent_prob = float(parent_entry.get("probability")) or 02689      score = parent_prob + 0.0000012690      if word.startswith(parent):2691        score *= 22692      if parent in ancestors:2693        score += 12694      else:2695        is_dup = False2696        for suffix in ("ing", "ed", "er", "or", "ism", "ist", "est"):2697          for ancestor in ancestors:2698            candidate = ancestor + suffix2699            if (parent[:3] == candidate[:3] and2700                tkrzw.Utility.EditDistanceLev(parent, candidate) < 2):2701              is_dup = True2702        if is_dup:2703          continue2704      scored_parents.append((parent, score))2705    scored_parents = sorted(scored_parents, key=lambda x: x[1], reverse=True)2706    word_entry["parent"] = [x[0] for x in scored_parents]2707  def AbsorbInflections(self, word_entry, merged_dict):2708    word = word_entry["word"]2709    infls = []2710    for infl_name in inflection_names:2711      infl_value = word_entry.get(infl_name)2712      if infl_value:2713        for infl in infl_value.split(","):2714          infl = infl.strip()2715          if infl and infl != word and infl not in infls:2716            infls.append(infl)2717    phrases = []2718    for infl in infls:2719      infl_entries = merged_dict.get(infl)2720      if not infl_entries: continue2721      for infl_entry in infl_entries:2722        if infl_entry["word"] != infl: continue2723        is_core = False2724        good_labels = set()2725        num_good_items = 02726        for infl_item in infl_entry["item"]:2727          label = infl_item["label"]2728          text = infl_item["text"]2729          if label in self.supplement_labels: continue2730          if regex.search(r"^\[\w+]:", text): continue2731          good_labels.add(label)2732          if label in self.core_labels:2733            is_core = True2734          num_good_items += 12735        alive = True2736        if len(good_labels) < 2 and not is_core and num_good_items < 3:2737          infl_entry["deleted"] = True2738          alive = False2739        infl_trans = infl_entry.get("translation")2740        if infl_trans:2741          phrase = {"w": infl, "x": infl_trans[:4]}2742          if alive:2743            phrase["i"] = "1"2744          phrases.append(phrase)2745    if phrases:2746      old_phrases = word_entry.get("phrase")2747      if old_phrases:2748        phrases = phrases + old_phrases2749      word_entry["phrase"] = phrases2750def main():2751  args = sys.argv[1:]2752  output_path = tkrzw_dict.GetCommandFlag(args, "--output", 1) or "union-body.tkh"2753  core_labels = set((tkrzw_dict.GetCommandFlag(args, "--core", 1) or "xa,wn").split(","))2754  full_def_labels = set((tkrzw_dict.GetCommandFlag(2755    args, "--full_def", 1) or "ox,wn,we").split(","))2756  gross_labels = set((tkrzw_dict.GetCommandFlag(args, "--gross", 1) or "wj").split(","))2757  top_labels = set((tkrzw_dict.GetCommandFlag(args, "--top", 1) or "we,lx,xa").split(","))2758  slim_labels = set((tkrzw_dict.GetCommandFlag(args, "--slim", 1) or "ox,we,wj").split(","))2759  surfeit_labels = set((tkrzw_dict.GetCommandFlag(args, "--surfeit", 1) or "we").split(","))2760  tran_list_labels = set((tkrzw_dict.GetCommandFlag(2761    args, "--tran_list", 1) or "xa,wn,we").split(","))2762  supplement_labels = set((tkrzw_dict.GetCommandFlag(args, "--supplement", 1) or "xs").split(","))2763  phrase_prob_path = tkrzw_dict.GetCommandFlag(args, "--phrase_prob", 1) or ""2764  tran_prob_path = tkrzw_dict.GetCommandFlag(args, "--tran_prob", 1) or ""2765  tran_aux_paths = (tkrzw_dict.GetCommandFlag(args, "--tran_aux", 1) or "").split(",")2766  tran_aux_last_paths = (tkrzw_dict.GetCommandFlag(args, "--tran_aux_last", 1) or "").split(",")2767  rev_prob_path = tkrzw_dict.GetCommandFlag(args, "--rev_prob", 1) or ""2768  cooc_prob_path = tkrzw_dict.GetCommandFlag(args, "--cooc_prob", 1) or ""2769  aoa_paths = (tkrzw_dict.GetCommandFlag(args, "--aoa", 1) or "").split(",")2770  keyword_path = tkrzw_dict.GetCommandFlag(args, "--keyword", 1) or ""2771  min_prob_exprs = tkrzw_dict.GetCommandFlag(args, "--min_prob", 1) or ""2772  min_prob_map = {}2773  for min_prob_expr in min_prob_exprs.split(","):2774    columns = min_prob_expr.split(":")2775    if len(columns) == 2:2776      min_prob_map[columns[0]] = float(columns[1])2777  if tkrzw_dict.GetCommandFlag(args, "--quiet", 0):2778    logger.setLevel(logging.ERROR)2779  unused_flag = tkrzw_dict.GetUnusedFlag(args)2780  if unused_flag:2781    raise RuntimeError("Unknow flag: " + unused_flag)2782  inputs = tkrzw_dict.GetArguments(args)2783  if not inputs:2784    raise RuntimeError("inputs are required")2785  input_confs = []2786  for input in inputs:2787    input_conf = input.split(":", 1)2788    if len(input_conf) != 2:2789      raise RuntimeError("invalid input: " + input)2790    input_confs.append(input_conf)2791  BuildUnionDBBatch(input_confs, output_path, core_labels, full_def_labels, gross_labels,2792                    surfeit_labels, top_labels, slim_labels, tran_list_labels, supplement_labels,2793                    phrase_prob_path, tran_prob_path, tran_aux_paths, tran_aux_last_paths,2794                    rev_prob_path, cooc_prob_path, aoa_paths, keyword_path,2795                    min_prob_map).Run()2796if __name__=="__main__":...

shorte_engine.py

Source:shorte_engine.py

...377    def add_wikiword(self, word):378        if(self.m_wiki_links.has_key(word)):379            FATAL("Wikiword %s already exists" % word)380        self.m_wiki_links[word.wikiword] = word381    def is_wiki_word(self, phrase):382        '''Returns the target link if the phrase is a wikiword383           or None if it does not exist'''384        link = None385        if(self.m_wiki_links.has_key(phrase)):386            link = self.m_wiki_links[phrase]387        return link388    def inkscape_to_png(self, name):389        '''This method is called to convert an inkscape390           SVG to PNG format for embedding in a document'''391        input = os.path.abspath(name)392        parts = os.path.splitext(input)393        basename = parts[0]394        #print "input = %s" % input395        #print "basename = %s" % basename...

template_sql.py

Source:template_sql.py

...294                    if(tmp == word):295                        exclude_word = True296                        break297                if(not exclude_word):298                    link = self.m_engine.is_wiki_word(word)299                if(link != None):300                    tmp = "<a href='%s#%s'>%s</a>" % (self.get_output_path(link), word, word)301                    #print "WIKIWORD: %s" % tmp302                    output += tmp303                else:304                    #print "HERE I AM: %s" % word305                    output += word306        return output307    308    def format_text(self, data, allow_wikify=True, exclude_wikify=[], expand_equals_block=False):309        if(data == None):310            return311        if(len(data) != 0):312            data = re.sub("'", "&apos;", data)...

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.