How to use rule_keywords method in Gherkin-python

Best Python code snippets using gherkin-python
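The snippets below come from open-source projects that define or use an identifier named rule_keywords; none of them call the Gherkin parser directly. In the gherkin (gherkin-official) Python package itself, the closest counterpart is the Dialect class, which in recent releases exposes the localized keywords for Rule: through a rule_keywords property. The following minimal sketch assumes that API (Dialect.for_name and the rule_keywords property); verify it against your installed version.

# Hedged sketch: reading the localized "Rule" keywords from a Gherkin dialect.
# Assumes the gherkin-official package and that Dialect.for_name() and the
# rule_keywords property exist as in recent releases.
from gherkin.dialect import Dialect

dialect = Dialect.for_name('en')   # load the English dialect
print(dialect.rule_keywords)       # expected: ['Rule']
print(dialect.feature_keywords)    # the other keyword lists follow the same pattern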

user_alerts_mon_2013_ref.py

Source: user_alerts_mon_2013_ref.py (GitHub)


1"""2Created on Feb 21, 20133@author: wmechem4This is the finRocket ALERT module. It is code to monitor5incoming stories and process alerts based on settings saved by users.6Alerts may include ENTITIES, TICKERS, contained in the story title or7body as well as SENTIMENT etc. See parse_rules()8Uses ZMQ to listen to incoming messages.9Alerts are sent via EMAIL or SMS based on user's preferences10pylint 7.5 / 10 2016/05/15 - needs refactoring11"""12from datetime import datetime13from email.mime.text import MIMEText14import simplejson as json15import logging16from multiprocessing import Process, Queue17import os18import re19import smtplib20import threading21import time22from flask import Flask23from flask.ext.sqlalchemy import SQLAlchemy24import MySQLDB as mdb25from pandas import DataFrame26from pandas.io import sql27import zmq28#setup ZMQ29UA_CONTEXT = zmq.Context()30USER_ALERTS_PULL_ADDR = 'tcp://127.0.0.1:6040'31print "user_alerts module 2013_09_09_localIP"32HOME_DIR = os.environ['HOME']33print 'Home DIR is ', HOME_DIR34LOAD_ALERT_FREQ = 300 # seconds before checking for changes35DB_HOST = 'xxx5-22.compute-1.amazonaws.com'36ALERT_DB_LIMIT = 1000 # number of alerts to load37ALERT_DB_OFFSET = 038logging.basicConfig(filename=HOME_DIR + "/kj_alerts.log", level=logging.DEBUG)39logging.info("Starting User Alerts Monitor @ " + str(datetime.now()))40MODULE_NAME = "User Alerts Monitor"41VERBOSE = 0 # set VERBOSE != 1 to turn off extra logging42APP = Flask(__name__)43DB = SQLAlchemy(APP)44APP.config['SQLALCHEMY_DATABASE_URI'] = 'mysql://kj_user:pwd@' + (45 DB_HOST + '/kj_feb_2013_01')46USER_ALERTS = ""47class UserProfile(DB.Model):48 """ Access user profiles. """49 def __init__(self):50 pass51 __tablename__ = 'kj_users'52 id_ = DB.Column(DB.Integer, primary_key=True)53 username = DB.Column(DB.String(80))54 userpasswd = DB.Column(DB.String(80))55 user_created_date = DB.Column(DB.Date)56 user_email = DB.Column(DB.String(100))57 user_mobile = DB.Column(DB.String(40))58 user_carrier = DB.Column(DB.String(40))59class UserAlert(DB.Model):60 """ Access user alerts. """61 __tablename__ = 'kj_user_alerts'62 id_ = DB.Column(DB.Integer, primary_key=True)63 username = DB.Column(DB.String(80))64 user_alert_name = DB.Column(DB.String(20))65 user_alert_created_date = DB.Column(DB.Date)66 user_alert_scope = DB.Column(DB.String(80))67 user_alert_condition = DB.Column(DB.String(40))68 user_alert_keywords = DB.Column(DB.String(2000))69 user_alert_actions = DB.Column(DB.String(40))70 user_alert_triggered_state = DB.Column(DB.String(40))71 user_alert_triggered_time = DB.Column(DB.Date)72 user_alert_delet = DB.Column(DB.Integer(1))73 user_alert_ext_op = DB.Column(DB.String(10))74 user_alert_ext_scop = DB.Column(DB.String(80))75 user_alert_ext_conditio = DB.Column(DB.String(40))76 user_alert_ext_keywords = DB.Column(DB.String(2000))77 def __init__(self, id_, username, a_name, a_created_date, a_scope,78 a_condition, a_keywords, a_actions, a_state, a_state_time,79 a_delete, a_ext_op, a_ext_scope, a_ext_condition, a_ext_keywords):80 """ Initialize alert attributes. 
"""81 self.id_ = id_82 self.username = username83 self.user_alert_name = a_name84 self.user_alert_created_date = a_created_date85 self.user_alert_scope = a_scope86 self.user_alert_condition = a_condition87 self.user_alert_keywords = a_keywords88 self.user_alert_actions = a_actions89 self.user_alert_triggered_state = a_state90 self.user_alert_triggered_time = a_state_time91 self.user_alert_delete = a_delete92 self.user_alert_ext_op = a_ext_op93 self.user_alert_ext_scope = a_ext_scope94 self.user_alert_ext_condition = a_ext_condition95 self.user_alert_ext_keywords = a_ext_keywords96def do_log(msg):97 """ Generic logging function. """98 print msg99 if VERBOSE == 1:100 l_string = (MODULE_NAME + " "101 + msg + " " + str(datetime.now()))102 logging.info(l_string)103 else:104 return105 return106def load_user_alerts_df():107 """ Load user alerts. """108 con = mdb.connect(DB_HOST, 'kj_user', 'xxx', 'kj_feb_2013_01')109 dataframe = sql.read_frame("SELECT * FROM kj_user_alerts;", con)110 con.close()111 return dataframe112def load_user_alerts_in_q(user_alerts_rules_q):113 """ Periodically load user alerts in to a queue114 so that we can detect changes.115 """116 while True:117 try:118 # Keep getting queue until empty then replace with rules from db119 rules = user_alerts_rules_q.get_nowait()120 user_alerts_rules_q.put(rules)121 except Exception:122 # if queue is empty load alerts from database123 df = load_user_alerts_df()124 user_alerts_rules_q.put(df)125 time.sleep(LOAD_ALERT_FREQ)126 return127def load_once_user_alerts(limit, offset):128 """ init user alerts once from DB then from q """129 results = UserAlert.query.limit(limit).offset(offset).all()130 json_results = []131 for result in results:132 data = {"id_": result.id_,133 'username': result.username,134 'user_alert_name': result.user_alert_name,135 'user_alert_scope': result.user_alert_scope,136 'user_alert_condition': result.user_alert_condition,137 'user_alert_keywords': result.user_alert_keywords,138 'user_alert_actions': result.user_alert_actions,139 'user_alert_triggered_state': result.user_alert_triggered_state,140 'user_alert_triggered_time': result.user_alert_triggered_time,141 'user_alert_delete': result.user_alert_delete,142 'user_alert_ext_op': result.user_alert_ext_op,143 'user_alert_ext_scope': result.user_alert_ext_scope,144 'user_alert_ext_condition': result.user_alert_ext_condition,145 'user_alert_ext_keywords': result.user_alert_ext_keywords,146 }147 json_results.append(data)148 return json_results149def load_user_profile(username):150 """ get user info from DB """151 result = UserProfile.query.filter_by(username=username).first()152 json_result = {'username': result.username,153 'userpasswd': result.userpasswd,154 'user_created_date': result.user_created_date,155 'user_email': result.user_email,156 'user_mobile': result.user_mobile,157 'user_carrier': result.user_carrier158 }159 return json_result160def get_messages_from_kj_main_t(context, user_alerts_pull_addr,161 in_messages_pool_q):162 """ Listen for messages to parse coming from main process163 and put them in a queue.164 """165 context = context166 alerts_pull_socket = context.socket(zmq.PULL)167 alerts_pull_socket.connect(USER_ALERTS_PULL_ADDR)168 while True:169 if VERBOSE == 1:170 l_string = MODULE_NAME + " waiting for new message " + str(171 datetime.now())172 logging.info(l_string)173 message = alerts_pull_socket.recv_pyobj()174 if VERBOSE == 1:175 l_string = MODULE_NAME + ("putting message in in_messages_q " +176 str(datetime.now()))177 logging.info(l_string)178 
in_messages_pool_q.put(message)179 time.sleep(.1)180 return181def send_alert(target, user_alert_name, alert_out_message):182 """main SMTP handler"""183 comma_space = ', '184 dist_list = [target]185 smtpuser = 'msgs@wjtglobal.com' # for SMTP AUTH, set SMTP username here186 smtppass = 'xxx' # for SMTP AUTH, set SMTP password here187 msg = MIMEText(alert_out_message)188 msg['Subject'] = user_alert_name189 msg['From'] = 'alerts@finrocket.com'190 msg['To'] = comma_space.join(dist_list)191 mailServer = smtplib.SMTP('smtp.1and1.com', 587)192 mailServer.ehlo()193 mailServer.starttls()194 mailServer.ehlo()195 mailServer.login(smtpuser, smtppass)196 mailServer.sendmail(smtpuser, dist_list, msg.as_string())197 mailServer.close()198 return199def parse_rules(user_alerts_rules_q, new_rules_q):200 """ Get rules from q and parse them. Create dictionary objects to201 quickly test for existence of a rule when messgaes come in.202 """203 sent_dict = {}204 sent_keys = [-2, -1.75, -1.5, -1.25, 0, 1.25, 1.5, 1.75, 2]205 for key in sent_keys:206 sent_dict[key] = []207 title_dict = {}208 title_not_dict = {}209 any_dict = {}210 any_not_dict = {}211 entities_dict = {}212 entities_not_dict = {}213 tickers_dict = {}214 tickers_not_dict = {}215 rules = user_alerts_rules_q.get()216 do_log("Got rules DF from user_alserts_rules_q" + str(rules))217 if VERBOSE == 1:218 l_string = MODULE_NAME + " loaded user alert rules " + (219 str(datetime.now()))220 logging.info(l_string)221 else:222 pass223 print rules224 for i, row in enumerate(rules.values):225 do_log("Row is " + str(i))226 do_log("Number of rules is " + str(len(rules.values)))227 id_, username, user_alert_name, user_alert_created_date, \228 user_alert_scope, user_alert_condition, user_alert_keywords, \229 user_alert_actions, user_alert_triggered_state, \230 user_alert_triggered_time, user_alert_delete, \231 user_alert_ext_op, user_alert_ext_scope, user_alert_ext_condition,\232 user_alert_ext_keywords = row233 print str(id_) + " " + user_alert_name + (234 " " + user_alert_scope + " " +235 user_alert_condition + " " + user_alert_keywords)236 user_alert_scope = user_alert_scope.upper()237 user_alert_condition = user_alert_condition.upper()238 rule_keywords = user_alert_keywords.split(' ')239 rule_keywords = set(rule_keywords)240 if user_alert_scope == 'TITLE' and (241 user_alert_condition == 'CONTAINS'):242 for key in rule_keywords:243 key = key.upper()244 if title_dict.get[key]:245 title_dict[key].append(id_)246 else:247 title_dict[key] = [id_]248 if user_alert_scope == 'TITLE' and (249 user_alert_condition == 'DOES NOT CONTAIN'):250 for key in rule_keywords:251 key = key.upper()252 if title_not_dict.get(key):253 title_not_dict[key].append(id_)254 else:255 title_not_dict[key] = [id_]256 if user_alert_scope == 'ANY' and (257 user_alert_condition == 'CONTAINS'):258 for key in rule_keywords:259 key = key.upper()260 if any_dict.get(key):261 any_dict[key].append(id_)262 else:263 any_dict[key] = [id_]264 if user_alert_scope == 'ANY' and (265 user_alert_condition == 'DOES NOT CONTAIN'):266 for key in rule_keywords:267 key = key.upper()268 if any_not_dict.get(key):269 any_not_dict[key].append(id_)270 else:271 any_not_dict[key] = [id_]272 if user_alert_scope == 'TICKER' and (273 user_alert_condition == 'CONTAINS'):274 for key in rule_keywords:275 key = key.upper()276 if tickers_dict.get(key):277 tickers_dict[key].append(id_)278 else:279 tickers_dict[key] = [id_]280 if user_alert_scope == 'TICKER' and (281 user_alert_condition == 'DOES NOT CONTAIN'):282 for key in 
rule_keywords:283 key = key.upper()284 if tickers_not_dict.get(key):285 tickers_not_dict[key].append(id_)286 else:287 tickers_not_dict[key] = [id_]288 if user_alert_scope == 'ENTITIES' and (289 user_alert_condition == 'CONTAINS'):290 for key in rule_keywords:291 key = key.upper()292 if entities_dict.get(key):293 entities_dict[key].append(id_)294 else:295 entities_dict[key] = [id_]296 if user_alert_scope == 'ENTITIES' and (297 user_alert_condition == 'DOES NOT CONTAIN'):298 for key in rule_keywords:299 key = key.upper()300 if entities_not_dict.get(key):301 entities_not_dict[key].append(id_)302 else:303 entities_not_dict[key] = [id_]304 if user_alert_scope == 'SENTIMENT':305 do_log("Processing SENTIMENT dictionary")306 do_log("Sentiment Alert processed for id_ " +307 str(id_) + " " + user_alert_scope + " " +308 user_alert_condition + " " + str(user_alert_keywords))309 print user_alert_scope + " " + user_alert_condition310 if user_alert_scope == 'SENTIMENT' and user_alert_condition == '=':311 print user_alert_scope + " " + user_alert_condition312 user_alert_keywords = float(user_alert_keywords)313 for key in sent_dict.keys():314 if float(user_alert_keywords) == float(key):315 print "Matched =", float(key), float(user_alert_keywords)316 if sent_dict.get(key):317 sent_dict[key].append(str(id_))318 print sent_dict[key]319 print str(id_) + (320 "appending sent_dict for key " + str(key))321 else:322 #sent_dict[key = ser_alert_name323 sent_dict[key] = [str(id_)]324 print sent_dict[key]325 print str(id_) + (326 "creating entry in sent_dict for key " + str(key))327 else:328 pass329 if user_alert_scope == 'SENTIMENT' and user_alert_condition == '>':330 print user_alert_scope + " " + user_alert_condition331 print "Keyword " + user_alert_keywords332 user_alert_keywords = float(user_alert_keywords)333 for key in sent_dict.keys():334 print "Sentiment Key: " + str(key)335 if float(user_alert_keywords) < float(key):336 print "Matched >", float(key), float(user_alert_keywords)337 if sent_dict.get(key):338 sent_dict[key].append(str(id_))339 print sent_dict[key]340 print str(id_) + (341 "appending sent_dict for key " + str(key))342 else:343 #sent_dict[key = ser_alert_name344 sent_dict[key] = [str(id_)]345 print sent_dict[key]346 print str(id_) + (347 "creating entry in sent_dict for key " + str(key))348 else:349 pass350 if user_alert_scope == 'SENTIMENT' and user_alert_condition == '<':351 print user_alert_scope + " " + user_alert_condition352 user_alert_keywords = float(user_alert_keywords)353 for key in sent_dict.keys():354 if float(user_alert_keywords) > float(key):355 print "Matched < ", float(key), float(user_alert_keywords)356 if sent_dict.get(key):357 sent_dict[key].append(str(id_))358 print sent_dict[key]359 print str(id_) + "appending sent_dict for key " + (360 str(key))361 else:362 sent_dict[key] = str(id_)363 print sent_dict[key]364 print str(id_) + (365 "creating entry in sent_dict for key " + str(key))366 else:367 pass368 if user_alert_scope == 'SENTIMENT' and user_alert_condition == '=':369 print user_alert_scope + " " + user_alert_condition370 user_alert_keywords = float(user_alert_keywords)371 for key in sent_dict.keys():372 if float(user_alert_keywords) >= float(key):373 print "Matched > ", float(key), float(user_alert_keywords)374 if sent_dict.get(key):375 sent_dict[key].append(str(id_))376 print sent_dict[key]377 print str(id_) + "appending sent_dict for key " + (378 str(key))379 else:380 #sent_dict[key = user_alert_name381 sent_dict[key] = [str(id_)]382 print sent_dict[key]383 print 
str(id_) + (384 "creating entry in sent_dict for key " + str(key))385 else:386 pass387 if user_alert_scope == 'SENTIMENT' and user_alert_condition == '=':388 print user_alert_scope + " " + user_alert_condition389 for key in sent_dict.keys():390 if float(user_alert_keywords) <= float(key):391 print "Matched >", float(key), float(user_alert_keywords)392 if sent_dict.get(key):393 sent_dict[key].append(str(id_))394 print sent_dict[key]395 print str(id_) + " appending sent_dict for key " + (396 str(key))397 else:398 sent_dict[key] = [str(id_)]399 print sent_dict[key]400 print str(id_) + (401 "creating entry in sent_dict for key " + str(key))402 else:403 pass404 print "Parsed all rules into dictionaries"405 print "Title Keys:" + str(title_dict.keys())406 print "Title Not Keys:" + str(title_not_dict.keys())407 print "Any Keys:" + str(any_dict.keys())408 print "Any Not Keys:" + str(any_not_dict.keys())409 print "Entities Keys:" + str(entities_dict.keys())410 print "Entities Not Keys:" + str(entities_not_dict.keys())411 print "Tickers Keys:" + str(tickers_dict.keys())412 print "Tickers Not Keys:" + str(tickers_not_dict.keys())413 if VERBOSE == 1:414 l_string = MODULE_NAME + " Parsed all rules into dictionaries " + (415 str(datetime.now()))416 logging.info(l_string)417 else:418 pass419 out_list = [rules, sent_dict, title_dict, any_dict,420 entities_dict, tickers_dict, any_not_dict,421 title_not_dict, entities_not_dict, tickers_not_dict]422 new_rules_q.put(out_list)423 return out_list424def get_new_rules(new_rules_q):425 """ Check queue for new rules. """426 print "Getting new rules"427 if VERBOSE == 1:428 l_string = MODULE_NAME + " Getting new rules " + (429 str(datetime.now()))430 logging.info(l_string)431 else:432 pass433 try:434 # get rules from queue if they exist435 new_rules = (rules, sent_dict, title_dict,436 any_dict, entities_dict, tickers_dict,437 any_not_dict, title_not_dict, entities_not_dict,438 tickers_not_dict) = new_rules_q.get_nowait()439 except Exception(Queue.Empty):440 print "No new rules to get"441 return new_rules442def get_new_message(in_messages_pool_q):443 """ Loop on message queue.get """444 while True:445 try:446 #see if we have a new message447 message = in_messages_pool_q.get()448 print "Message ", len(message)449 if VERBOSE == 1:450 l_string = MODULE_NAME + "Message length " + (451 str(len(message)) + " " + str(datetime.now()))452 logging.info(l_string)453 else:454 pass455 yield message456 pass457 except Queue.Empty:458 time.sleep(.1)459 pass460def parse_message(message):461 """ Parse fields from dict object received from main KJ process"""462 message = json.loads(message)463 m_title = message['m_title']464 m_description = message['m_description']465 m_sentiment = message['m_sentiment']466 m_tickers = message['m_tickers']467 m_entities = message['m_entities']468 m_link = message['m_link']469 alert_out_message = m_title[0:20]+" S"+m_sentiment+" "+m_link470 return (m_title, m_description, m_sentiment, m_tickers,471 m_entities, m_link), alert_out_message472def process_sent_dict(alert_dict, sent_dict, m_sentiment, alerts_fired):473 """ Check to see if sentiment alert is triggered. 
"""474 alert_dict = sent_dict475 log_msg = "Processing " + str(alert_dict)476 do_log(log_msg)477 if m_sentiment:478 key = float(m_sentiment)479 alerts_fired.append(alert_dict[key])480 log_msg = "Alerts fired contains a sentiment alert " + (481 str(alerts_fired))482 do_log(log_msg)483 return alerts_fired484def get_any_tokens(m_title, m_description):485 """ Make tokens out of title and description text. """486 message = nltk.clean_html(m_description)487 log_msg = "Message len after m_description html clean:" + (488 str(len(message)))489 do_log(log_msg)490 if nltk.clean_html(m_title):491 m_title = nltk.clean_html(m_title)492 message = message + " " + m_title493 log_msg = "Message has length after title html clean:" + (494 str(len(message)))495 do_log(log_msg)496 else:497 log_msg = (498 "Error processing m_title with nltl.clean_html ")499 do_log(log_msg)500 message = nltk.word_tokenize(message)501 punctuation = re.compile(r'[-.?!,&":;()|0-9]')502 tokens = [punctuation.sub(" ", token) for token in message]503 log_msg = str(tokens)504 do_log(log_msg)505 return tokens506def process_any_dict(alert_dict, any_dict, m_title, m_description,507 alerts_fired):508 """ If title or description contain matching text alert is509 triggered.510 """511 log_msg = "Processing matches for ANY CONTAINS"512 do_log(log_msg)513 print log_msg514 alert_dict = any_dict515 log_msg = " processing " + str(alert_dict)516 do_log(log_msg)517 tokens = get_any_tokens(m_title, m_description)518 for token in set(tokens):519 token = token.upper()520 log_msg = "Looking for: " + token521 do_log(log_msg)522 if alert_dict.get(token):523 alerts_fired.append(alert_dict[token])524 log_msg = "Added " + str(alert_dict[token]) + (525 " to alerts_fired")526 do_log(log_msg)527 else:528 log_msg = "Token not found in alert_dict: " + token529 do_log(log_msg)530 return alerts_fired531def process_any_not_dict(alert_dict, any_not_dict, m_title, m_description,532 alerts_fired):533 """ If title or description contain matching (NOT) text alert534 is triggered.535 """536 log_msg = "Processing matches for ANY DOES NOT CONTAIN"537 do_log(log_msg)538 alert_dict = any_not_dict539 log_msg = "processing " + str(alert_dict)540 do_log(log_msg)541 tokens = get_any_tokens(m_title, m_description)542 count = 0543 for token in set(tokens):544 token = token.upper()545 log_msg = "Looking for: " + token546 do_log(log_msg)547 if alert_dict.get(token):548 count += 1549 else:550 pass551 if count == 0:552 for key in alert_dict.keys():553 alerts_fired.append(alert_dict[key])554 return alerts_fired555def get_title_tokens(m_title):556 """ Make tokens out of title text, """557 message = nltk.clean_html(m_title)558 message = nltk.word_tokenize(message)559 punctuation = re.compile(r'[-.?!,&":;()|0-9]')560 tokens = [punctuation.sub(" ", token) for token in message]561 return tokens562def process_title_dict(alert_dict, title_dict, m_title, alerts_fired):563 """ If title contains matching text alert is triggered. """564 log_msg = "Processing TITLE matches for CONTAINS"565 do_log(log_msg)566 alert_dict = title_dict567 log_msg = "processing " + str(alert_dict)568 do_log(log_msg)569 tokens = get_title_tokens(m_title)570 for token in tokens:571 token = token.upper()572 log_msg = "Looking for: " + token573 do_log(log_msg)574 if title_dict.get(token):575 alerts_fired.append(title_dict[token])576 else:577 pass578 return alerts_fired579def process_title_not_dict(alert_dict, title_not_dict, m_title, alerts_fired):580 """ If title does not contain matching text alert is triggered. 
"""581 log_msg = "Processing TITLE matches for DOES NOT CONTAIN"582 do_log(log_msg)583 alert_dict = title_not_dict584 log_msg = "processing " + str(alert_dict)585 do_log(log_msg)586 tokens = get_title_tokens(m_title)587 count = 0588 for token in tokens:589 token = token.upper()590 log_msg = "Looking for: " + token591 do_log(log_msg)592 if alert_dict.get(token):593 count += 1594 else:595 pass596 if count == 0:597 for key in alert_dict.keys():598 alerts_fired.append(alert_dict[key])599 return alerts_fired600def process_tickers_dict(alert_dict, tickers_dict, m_tickers, alerts_fired):601 """ If tickers contains matching symbol alert is triggered. """602 log_msg = "Processing TICKERS matches for CONTAINS"603 do_log(log_msg)604 alert_dict = tickers_dict605 log_msg = "processing " + str(alert_dict)606 do_log(log_msg)607 for token in set(m_tickers.split(',')):608 token = token.upper()609 log_msg = "Looking for: " + token610 do_log(log_msg)611 if alert_dict.get(token):612 alerts_fired.append(alert_dict[token])613 else:614 pass615 return alerts_fired616def process_tickers_not_dict(alert_dict, tickers_not_dict, m_tickers,617 alerts_fired):618 """ If tickers does not contain matching symbol alert is triggered. """619 alert_dict = tickers_not_dict620 log_msg = "processing " + str(alert_dict)621 do_log(log_msg)622 log_msg = "Processing TICKERS matches for DOES NOT CONTAIN"623 do_log(log_msg)624 for token in set(m_tickers.split(',')):625 token = token.upper()626 log_msg = "Looking for: " + token627 do_log(log_msg)628 count = 0629 if alert_dict.get(token):630 count += 1631 else:632 pass633 if count == 0:634 for key in alert_dict.keys():635 alerts_fired.append(alert_dict[key])636 return alerts_fired637def process_entities_dict(alert_dict, entities_dict, m_entities, alerts_fired):638 """ If entities contains matching name alert is triggered. """639 log_msg = "Processing ENTITIES matches for CONTAINS"640 do_log(log_msg)641 alert_dict = entities_dict642 for token in set(m_entities.split(',')):643 token = token.upper()644 log_msg = "Looking for: " + token645 do_log(log_msg)646 if alert_dict.get(token):647 alerts_fired.append(alert_dict[token])648 else:649 pass650 return alerts_fired651def process_entities_not_dict(alert_dict, entities_not_dict, m_entities,652 alerts_fired):653 """ If entities does contain matching name alert is triggered. """654 alert_dict = entities_not_dict655 log_msg = "processing " + str(alert_dict)656 do_log(log_msg)657 log_msg = "Processing ENTITIES matches for DOES NOT CONTAIN"658 do_log(log_msg)659 count = 0660 for token in set(m_entities.split(',')):661 token = token.upper()662 log_msg = "Looking for: " + token663 do_log(log_msg)664 if alert_dict.get(message[token]):665 count += 1666 else:667 pass668 if count == 0:669 for key in alert_dict.keys():670 alerts_fired.append(entities_not_dict[key])671 return alerts_fired672def process_alerts_fired(alerts_fired, rules, alert_out_message):673 """ Process alerts_fired list. 
"""674 log_msg = "Beginning processing of alerts_fired" + (675 str(alerts_fired))676 do_log(log_msg)677 alerts_list = []678 for id_ in alerts_fired:679 for i in id_:680 do_log("alerts_fired contains " + str(alerts_fired))681 do_log("alerts_fired id_[0] = " + str(i))682 alerts_list.append(i)683 do_log("Added " + str(i) + " to alerts_list -> " + (684 str(alerts_list)))685 alerts_list = set(alerts_list)686 do_log("alerts_list contains: " + str(alerts_list))687 print "alerts_list is:" + str(alerts_list)688 try:689 for alert_ in alerts_list:690 do_log("Alert is type " + str(type(alert_)))691 log_msg = "Processing profile for alert " + str(alert_)692 do_log(log_msg)693 do_log(str(rules))694 rule_df = DataFrame()695 rule_df = rules[rules['id_'].isin([int(alert_), ])]696 do_log("rule_df contains: " + str(rule_df))697 if rule_df:698 for i, row in enumerate(699 rule_df['username'].values):700 username = row701 username = str(username)702 do_log(username)703 for i, row in enumerate(704 rule_df['user_alert_name'].values):705 user_alert_name = row706 user_alert_name = str(user_alert_name)707 do_log("Found " + user_alert_name)708 try:709 #get user profile to determine alert actions710 user_profile = load_user_profile(username)711 if user_alert_actions == 'TEXT':712 #just send an sms713 target = user_profile['user_mobile'] + "@" + (714 user_profile['user_carrier'])715 send_alert(target, user_alert_name,716 alert_out_message)717 log_msg = "Sending TEXT to " + str(target)718 do_log(log_msg)719 send_alert(target, user_alert_name,720 alert_out_message)721 if user_alert_actions == 'TEXT & EMAIL':722 #send both sms and email723 target = (str(user_profile['user_mobile'])724 + "@"725 + str(user_profile['user_carrier'])726 + ".com")727 log_msg = ("Sending TEXT and EMAIL to "728 + str(target))729 do_log(log_msg)730 send_alert(target, user_alert_name,731 alert_out_message)732 target = str(user_profile['user_email'])733 log_msg = "Sending EMAIL to " + str(target)734 do_log(log_msg)735 send_alert(target, user_alert_name,736 alert_out_message)737 if user_alert_actions == 'EMAIL':738 log_msg = "Sending EMAIL to " + str(target)739 do_log(log_msg)740 target = str(user_profile['user_email'])741 send_alert(target, user_alert_name,742 alert_out_message)743 except Exception as error:744 do_log(str(error) + " in Sending function for " + (745 str(username) + " alert: " + str(alert_)))746 except Exception as error:747 do_log(str(error) + " in processing profile " + (748 str(username) + " alert: " + str(alert_)))749 pass750def process_message_p(in_messages_pool_q, out_messages_q, new_rules_q):751 """ Main function. 
Load alerts from queue then filter incoming752 messages with alert key words and conditions """753 while True:754 alerts_fired = []755 rules, sent_dict, title_dict, any_dict, \756 entities_dict, tickers_dict, any_not_dict, \757 title_not_dict, entities_not_dict, \758 tickers_not_dict = get_new_rules(new_rules_q)759 message, alert_out_message = get_new_message(in_messages_pool_q)760 alert_dict = {}761 m_title, m_description, m_sentiment, m_tickers, \762 m_entities, m_link = parse_message(message)763 alerts_fired = process_sent_dict(alert_dict, sent_dict,764 m_sentiment, alerts_fired)765 alerts_fired = process_any_dict(alert_dict, any_dict,766 m_title, m_description, alerts_fired)767 alerts_fired = process_any_not_dict(alert_dict, any_not_dict,768 m_title, m_description, alerts_fired)769 alerts_fired = process_title_dict(alert_dict, title_dict, m_title,770 alerts_fired)771 alerts_fired = process_title_not_dict(alert_dict, title_not_dict,772 m_title, alerts_fired)773 alerts_fired = process_tickers_dict(alert_dict, tickers_dict,774 m_tickers, alerts_fired)775 alerts_fired = process_tickers_not_dict(alert_dict, tickers_not_dict,776 m_tickers, alerts_fired)777 alerts_fired = process_entities_dict(alert_dict, entities_dict,778 m_entities, alerts_fired)779 alerts_fired = process_entities_not_dict(alert_dict,780 entities_not_dict, m_entities, alerts_fired)781 process_alerts_fired(alerts_fired, rules, alert_out_message)782def start_module():783 """ Setup queues and start threads and processes """784 in_messages_pool_q = Queue()785 out_messages_q = Queue()786 user_alerts_rules_q = Queue()787 new_rules_q = Queue()788 for i in range(0, 1):789 get_messages_t = threading.Thread(target=get_messages_from_kj_main_t,790 args=(UA_CONTEXT, USER_ALERTS_PULL_ADDR, in_messages_pool_q))791 get_messages_t.setDaemon(False)792 get_messages_t.start()793 for i in range(0, 1):794 load_user_alerts_t = threading.Thread(target=load_user_alerts_in_q,795 args=(user_alerts_rules_q))796 load_user_alerts_t.setDaemon(False)797 load_user_alerts_t.start()798 for i in range(0, 1):799 proc_rules_p = Process(target=parse_rules,800 args=(user_alerts_rules_q, new_rules_q,))801 proc_rules_p.start()802 for i in range(0, 1):803 proc_messages_p = Process(target=process_message_p,804 args=(in_messages_pool_q, out_messages_q, new_rules_q,))805 proc_messages_p.start()...
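In this snippet rule_keywords is an ordinary local variable: the set of tokens produced by splitting an alert's user_alert_keywords field on spaces. parse_rules() then indexes every keyword into per-scope dictionaries (title, any, ticker, entity) keyed by the upper-cased keyword, so incoming stories can be matched with plain dictionary lookups. A condensed sketch of that indexing pattern follows; the rule records and field names are hypothetical stand-ins, not part of the original module.

# Condensed illustration of the keyword-indexing pattern used by parse_rules()
# above. Rule records and field names are invented for the example.
def build_keyword_index(rules):
    """Map each upper-cased keyword to the ids of the alerts that watch it."""
    index = {}
    for rule in rules:  # each rule: {'id': ..., 'keywords': 'space separated words'}
        rule_keywords = set(rule['keywords'].split(' '))
        for key in rule_keywords:
            index.setdefault(key.upper(), []).append(rule['id'])
    return index

print(build_keyword_index([{'id': 1, 'keywords': 'AAPL earnings'},
                           {'id': 2, 'keywords': 'earnings'}]))
# -> {'AAPL': [1], 'EARNINGS': [1, 2]} (key order may vary)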


grammar_test.py

Source: grammar_test.py (GitHub)


...
            rule_positional('ABC', 123, '=', '+')
                =
                'a'
                ;
            rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
                =
                'b'
                ;
            rule_all('DEF', 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
                =
                'c'
                ;
        '''
        pretty = '''
            start
                =
                {rule_positional | rule_keywords | rule_all} $
                ;
            rule_positional(ABC, 123, '=', '+')
                =
                'a'
                ;
            rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
                =
                'b'
                ;
            rule_all(DEF, 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
                =
                'c'
                ;
        '''
        model = genmodel('RuleArguments', grammar)
        self.assertEqual(trim(pretty), ustr(model))
        model = genmodel('RuleArguments', pretty)
        ast = model.parse("a b c")
        self.assertEqual(['a', 'b', 'c'], ast)
        semantics = TC36Semantics()
        ast = model.parse("a b c", semantics=semantics)
        self.assertEqual(['a', 'b', 'c'], ast)
        codegen(model)

    def test_36_unichars(self):
        grammar = '''
            start = { rule_positional | rule_keywords | rule_all }* $ ;
            rule_positional("ÄÖÜäöüß") = 'a' ;
            rule_keywords(k1='äöüÄÖÜß') = 'b' ;
            rule_all('ßÄÖÜäöü', k1="ßäöüÄÖÜ") = 'c' ;
        '''

        def _trydelete(pymodule):
            import os
            try:
                os.unlink(pymodule + ".py")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyc")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyo")
...
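Here rule_keywords is not a method either: it is the name of a parameterized grammar rule in Grako's EBNF, declared with keyword arguments (k1=ABC, k3='=', ...), and the test checks that those arguments survive a round-trip through the pretty-printer and code generator. Below is a minimal, hedged sketch of compiling and parsing such a grammar; the compile(grammar, name) call mirrors the parameter_test.py excerpt further down, and its import path is an assumption.

# Hedged sketch of parsing input with a keyword-parameterized rule.
# `compile(grammar, name)` follows the usage shown in parameter_test.py below;
# `from grako import compile` is an assumption about newer grako releases
# (with the maintained TatSu fork the equivalent is `import tatsu; tatsu.compile(...)`).
from grako import compile

GRAMMAR = '''
    start = { rule_positional | rule_keywords | rule_all } $ ;
    rule_positional('ABC', 123, '=', '+') = 'a' ;
    rule_keywords(k1=ABC, k3='=', k4='+', k2=123) = 'b' ;
    rule_all('DEF', 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789) = 'c' ;
'''

model = compile(GRAMMAR, 'RuleArguments')
print(model.parse("a b c"))   # the tests above expect ['a', 'b', 'c']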


dataloader.py

Source: dataloader.py (GitHub)


# Imports
import torch
import numpy as np
import logging
import pickle
import os
import pytorch_lightning as pl
# Submodules
from typing import Union, List
from tqdm import tqdm, trange
from torch.utils.data import Dataset, TensorDataset
from snorkel.labeling import LFApplier
from snorkel_utils import make_keyword_lf

# Need to set tokenizers_parallelism environment variable to avoid lots of warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set up logging
logger = logging.getLogger('__file__')


# Collate function for RPNDataset
class RPNCollate():
    def __init__(self, tokenizer):
        # self.id2word = id2word
        self.tokenizer = tokenizer

    def __call__(self, batch):
        '''
        Collate function to turn batch from dataloader into clean dict of output
        '''
        # print(batch)
        # print("Length", len(batch))
        # seq, attn_mask, labels, noisy_labels, noised_ids, mlm_labels, starts, ends = *batch
        input_ids = torch.stack(tuple([x['input_ids'] for x in batch]))
        attn_mask = torch.stack(tuple([x['attention_masks'] for x in batch]))
        labels = torch.stack(tuple([x['labels'] for x in batch]))
        noisy_labels = torch.stack(tuple([x['noisy_labels'] for x in batch]))
        soft_labels = torch.stack(tuple([x['soft_labels'] for x in batch]))
        noised_ids = torch.stack(tuple([x['noised_ids'] for x in batch]))
        mlm_labels = torch.stack(tuple([x['mlm_labels'] for x in batch]))
        starts = [x['word_starts'] for x in batch]
        ends = [x['word_ends'] for x in batch]
        # Get batch indices and start/end indices of each word
        batch_inds = torch.cat(tuple([i*torch.ones_like(s).long() for i, s in enumerate(starts)])).reshape(-1, 1)
        starts = torch.cat(tuple(starts)).reshape(-1, 1)
        ends = torch.cat(tuple(ends)).reshape(-1, 1)
        # Get tensor to select ids and/or embeddings for each word from a tensor
        word_lengths = ends - starts
        max_len = word_lengths.max()
        selector_inds = starts + torch.arange(max_len)
        selector_mask = (selector_inds < ends)
        selector_inds[~selector_mask] = 0
        # Get all words in the batch to be used for creating phrase-based rules
        batch_words = reconstruct_words(input_ids, starts, ends, self.tokenizer, batch_inds=batch_inds)
        output_dict = {
            'input_ids': input_ids,
            'attention_masks': attn_mask,
            'labels': labels,
            'noisy_labels': noisy_labels,
            'noised_ids': noised_ids,
            'mlm_labels': mlm_labels,
            'batch_inds': batch_inds,
            'word_starts': starts,
            'word_ends': ends,
            'word_inds': selector_inds,
            'word_mask': selector_mask,
            'batch_words': batch_words,
            'soft_labels': soft_labels
        }
        return output_dict


# Helper functions
# def reconstruct_words(input_ids, starts, ends, id2word, batch_inds=None):
def reconstruct_words(input_ids, starts, ends, tokenizer, batch_inds=None):
    '''
    Reconstruct all words in text from their input ids
    '''
    words = []
    ss = starts.flatten()
    es = ends.flatten()
    if batch_inds is not None:
        bs = batch_inds.flatten()
        words = [tokenizer.decode(input_ids[b, s:e]) for b, s, e in zip(bs, ss, es)]
        # for (b, s, e) in zip(bs, ss, es):
        #     if s - e == 1:
        #         words.append[id2word[input_ids[b, s:e].item()]]
        #     else:
        #         subword_ids = input_ids[b, s:e].numpy()
        #         words.append(tokenizer.decode(subword_ids))
        #         words.append(merge_tokens(subword_ids, id2word))
    else:
        words = [tokenizer.decode(input_ids[s:e]) for s, e in zip(ss, es)]
        # for (s, e) in zip(ss, es):
        #     if s - e == 1:
        #         words.append[id2word[input_ids[s:e].item()]]
        #     else:
        #         subword_ids = input_ids[s:e].numpy()
        #         words.append(tokenizer.decode(subword_ids))
        #         words.append(merge_tokens(subword_ids, id2word))
    return words


# def merge_tokens(subword_ids, id2word):
#     '''
#     Merge tokens from subword units
#     '''
#     tokens = [id2word[i] for i in subword_ids]
#     s = tokens[0]
#     for t in tokens[1:]:
#         if t.startswith('##'):
#             s += t[2:]
#         else:
#             s += ' ' + t
#     return s


def get_word_spans(word_ids, punct_inds=None):
    '''
    Get spans of whole words from a list of wordpiece -> word mappings
    Params:
    -------
    word_ids: List
        List of which word is mapped to each individual token

    Returns:
    --------
    span_starts: torch.LongTensor
        Array of starts of word spans
    span_ends: torch.LongTensor
        Array of ends of word spans
    Example:
    --------
    Sentence: "the dog jumped excitedly"
    -> Tokenized: ['[CLS]', 'the', 'dog', 'jump', '##ed', 'excit', '##ed', '##ly', '[SEP]']
    -> word_ids: [None, 0, 1, 2, 2, 3, 3, 3, None]
    -> Spans: [(0,0), (1,2), (2,3), (3,5), (5,8), (0,0)]
    Usage: self.get_word_spans(word_ids) #word_ids as above
    -> returns: (tensor([1, 2, 3, 5]), tensor([2, 3, 5, 8]))
    '''
    prev_ind = None
    starts = []
    ends = []
    # Gather start and end indices
    for i, ind in enumerate(word_ids):
        if prev_ind != ind:
            if prev_ind != None:
                ends.append(i)
            if ind != None:
                starts.append(i)
            prev_ind = ind
    # Return tensors
    return (torch.LongTensor(starts), torch.LongTensor(ends))


def prep_data(text, tokenizer, max_length=128):
    '''
    Prep data for RPN usage
    '''
    enc = tokenizer(text, max_length=max_length, padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
    # Portion out different values
    encoded_text = enc['input_ids']
    attention_masks = enc['attention_mask']
    # Get word start/end indices
    word_spans = [get_word_spans(enc.word_ids(i)) for i in trange(len(text))]
    word_starts = [s[0] for s in word_spans]
    word_ends = [s[1] for s in word_spans]
    return encoded_text, attention_masks, word_starts, word_ends


class RPNDataset(Dataset):
    # RPN Dataset to mask keywords used in rules
    def __init__(self,
                 data,
                 tokenizer,
                 rule_keywords,
                 rule_tokens=[],
                 mask_prob=.1,
                 rule_mask_prob=.5,
                 seed_labels=None,
                 filter_labels=True,
                 max_length=128,
                 min_lf=1,
                 ):
        self.text = data['text']
        self.tokenizer = tokenizer
        if 'rule_keywords' in data:
            self.rule_keywords = data['rule_keywords']
        else:
            self.rule_keywords = rule_keywords
        # Tokenizer attributes
        self.word2id = tokenizer.vocab
        self.mask_id = self.word2id['[MASK]']
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.max_length = max_length
        # Make sure data is ready for deep learning models
        if 'encoded_text' not in data.keys():
            self.prepare_data()
        else:
            self.encoded_text = data['encoded_text']
            self.attention_masks = data['attention_masks']
            self.word_starts = data['word_starts']
            self.word_ends = data['word_ends']
        self.labels = data['labels']
        if 'word_lists' in data.keys():
            self.word_lists = data['word_lists']
        else:
            logger.info("Computing word lists")
            self.word_lists = [reconstruct_words(ids, starts, ends, self.tokenizer)
                               for (ids, starts, ends) in tqdm(zip(self.encoded_text,
                                                                   self.word_starts,
                                                                   self.word_ends))]
        # Make sure noisy labels are there
        self.min_lf = min_lf
        if 'noisy_labels' not in data:
            self.make_lfs(rpn_generated=False)
            self.make_noisy_labels()
        else:
            self.noisy_labels = data['noisy_labels']
        self.balance_noisy_labels()
        if 'soft_labels' in data:
            self.soft_labels = data['soft_labels']
        else:
            self.soft_labels = None
            # self.soft_labels = data['soft_labels']

        # labeled_inds = ((self.noisy_labels >= 0).sum(dim=1) >= min_lf).nonzero().flatten()
        # logger.debug(labeled_inds.size)
        # logger.debug(f'Proportion labeled: {labeled_inds.size(0)/self.noisy_labels.size(0)}')
        # self.labeled_inds = labeled_inds

        # Get vocab size
        self.vocab_size = int(np.max(list(self.word2id.values())) + 1)
        self.num_special_tokens = int(np.max([val for key, val in self.word2id.items() if key.startswith('[')]) + 1)
        # Rule attributes
        self.rule_tokens = rule_tokens
        self.rule_map = {val: val for val in self.word2id.values()}
        self.update_rule_map(rule_tokens)
        self.is_rule = {val: 0 for val in self.word2id.values()}
        for w in rule_tokens:
            if w.strip() in self.word2id:
                self.is_rule[self.word2id[w.strip()]] = 1

        # Misc attributes
        self.p = mask_prob
        self.rule_p = rule_mask_prob
        self.length = len(self.text)
        self.idx_map = {i: i for i in range(self.length)}

    def prepare_data(self,):
        '''
        Prepare data by tokenizing, padding, and getting word start/end indices
        Params:
        -------
        text: List[str]
            List of text of each instance
        '''
        # Encode text
        enc = self.tokenizer(self.text, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
        # Portion out different values
        self.encoded_text = enc['input_ids']
        self.attention_masks = enc['attention_mask']
        # Get word start/end indices
        word_spans = [get_word_spans(enc.word_ids(i)) for i in trange(len(self.text))]
        self.word_starts = [s[0] for s in word_spans]
        self.word_ends = [s[1] for s in word_spans]

    # Make more general to apply to n-grams/phrases
    def make_lfs(self, rpn_generated=True):
        '''
        Make labeling functions from keywords/phrases
        '''
        self.keyword_lfs = [make_keyword_lf(w, label, rpn_generated=rpn_generated) for label, words in self.rule_keywords.items() for w in words if not ' ' in w]
        self.phrase_lfs = [make_keyword_lf(w, label, rpn_generated=rpn_generated) for label, words in self.rule_keywords.items() for w in words if ' ' in w]

    def make_noisy_labels(self):
        '''
        Make noisy labels from labeling functions
        '''
        if len(self.keyword_lfs) > 0:
            keyword_applier = LFApplier(lfs=self.keyword_lfs)
            keyword_noisy_labels = torch.LongTensor(keyword_applier.apply(self.word_lists))
            noisy_labels = keyword_noisy_labels
        if len(self.phrase_lfs) > 0:
            phrase_applier = LFApplier(lfs=self.phrase_lfs)
            phrase_noisy_labels = torch.LongTensor(phrase_applier.apply(self.text))
            noisy_labels = phrase_noisy_labels
        if len(self.keyword_lfs) > 0 and len(self.phrase_lfs) > 0:
            noisy_labels = torch.cat((keyword_noisy_labels, phrase_noisy_labels), dim=1)
        self.full_noisy_labels = noisy_labels

    def balance_noisy_labels(self):
        '''
        Balance number of noisy labels for each class to prevent model imbalance
        '''
        self.noisy_labels = self.full_noisy_labels.clone()
        label_counts = [(self.noisy_labels == label).sum().item() for label in self.rule_keywords.keys()]
        logger.debug(f"Old label counts: {label_counts}")
        # Balance classes
        count_min = min(label_counts)
        for label in self.rule_keywords.keys():
            count = (self.noisy_labels == label).sum()
            cutoff = (count - count_min)/count
            mask = (torch.rand(self.noisy_labels.size()) < cutoff) & (self.noisy_labels == label)
            self.noisy_labels[mask] = -1
        label_counts = [(self.noisy_labels == label).sum() for label in self.rule_keywords.keys()]
        logger.debug(f"New label counts: {label_counts}")
        labeled_inds = ((self.noisy_labels >= 0).sum(dim=1) >= self.min_lf).nonzero().flatten()
        # logger.debug(labeled_inds.size)
        logger.debug(f'Proportion labeled: {labeled_inds.size(0)/self.noisy_labels.size(0)}')
        self.labeled_inds = labeled_inds

    def _use_labeled(self):
        '''
        Switches model to only iterate through labeled data
        '''
        labeled_inds = ((self.noisy_labels >= 0).sum(dim=1) >= self.min_lf).nonzero().flatten()
        self.labeled_inds = labeled_inds
        self.length = self.labeled_inds.size(0)
        self.idx_map = {i: self.labeled_inds[i] for i in range(self.length)}
        # Debugging statements
        # logger.debug(labeled_inds.size)
        logger.debug(f'Proportion labeled: {labeled_inds.size(0)/self.noisy_labels.size(0)}')

    # return noisy_labels
    # def precompute_phrase_counts(self):
    #     '''
    #     Precompute word counts for faster model training
    #     '''
    #     phrase_counts = defaultdict(int)
    #     phrase_inds = defaultdict(set)
    #     normalized_text = []
    #     logger.info("Precomputing phrase counts")
    #     for j, word_list in enumerate(tqdm(self.train['word_lists'])):
    #         normalized_text.append(" ".join(word_list))
    #         # normalized_text.append(self.tokenizer.decode(self.tokenizer.encode(word_list)[1:-1]))
    #         for l in range(1, 1 + self.args.max_rule_length):
    #             phrases = [" ".join(word_list[i:i+l]) for i in range(len(word_list) - l + 1)]
    #             for p in phrases:
    #                 if any([punct in p for punct in '.,!?"\\']):
    #                     continue
    #                 phrase_counts[p] += 1
    #                 phrase_inds[p].add(j)
    #     self.train['text'] = normalized_text
    #     self.phrase_counts = {k:v for k, v in phrase_counts.items() if v >= self.min_count_cutoff and k not in self.words_to_exclude}
    #     logger.debug(f"Num Phrases: {len(self.phrase_counts)}")
    #     self.phrase_inds = {k:list(phrase_inds[k]) for k in self.phrase_counts.keys()}

    def update_rule_map(self, kwds):
        for kwd in kwds:
            self.rule_map[kwd] = self.mask_id

    def token_match(self, token, alg='random', n=5):
        '''
        Match examples based on token
        '''
        pass

    def phrase_match(self, phrase, alg='random', n=5):
        '''
        Match examples based on phrase
        '''
        pass

    # Needs updating for whole words/phrases
    def noise_input_tokens(self, seq, p=1):
        '''
        Add noise to input sequences for MLM loss
        Inputs:
        -------
        seq: Input sequence on which to mask tokens
        p: Probability with which to mask each token from a rule
        '''
        rule_tokens = torch.tensor([self.is_rule[w.item()] for w in seq]).bool()
        # rule_mask_ps = (torch.ones_like(rule_tokens) * p)
        # rule_draws = torch.bernoulli(rule_mask_ps).bool()
        # masked_rule_tokens = (rule_tokens & rule_draws)
        # MLM Loss
        ps = self.p * torch.ones_like(seq)
        mlm_mask = (torch.bernoulli(ps).bool() & (seq >= self.num_special_tokens))
        # mask = (mlm_mask | masked_rule_tokens)
        mask = (mlm_mask | rule_tokens)
        # # Debugging
        # if rule_tokens.sum() > 0:
        #     logger.debug(rule_tokens.sum())
        # if mlm_mask.sum() != mask.sum():
        #     logger.debug(f"mlm_mask: {mlm_mask.sum()}")
        #     logger.debug(f"mask: {mask.sum()}")
        #     logger.debug("mask should be larger")
        # Labels
        mlm_labels = seq.clone()
        mlm_labels[~mask] = -100
        # Get masks of how to noise tokens
        a = torch.rand(seq.size())
        mask_token_locs = (mask & (a < .8))
        random_token_locs = (mask & (a > .9))
        num_random = random_token_locs.sum()
        random_tokens = torch.randint(low=self.num_special_tokens,
                                      high=self.vocab_size,
                                      size=(num_random.item(),))
        # Noise input ids
        noised_ids = seq.clone()
        noised_ids[mask_token_locs] = self.mask_id
        noised_ids[random_token_locs] = random_tokens
        return noised_ids, mlm_labels

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        idx = self.idx_map[i]

        seq = self.encoded_text[idx]
        attn_mask = self.attention_masks[idx]
        labels = self.labels[idx]
        noisy_labels = self.noisy_labels[idx]
        noised_ids, mlm_labels = self.noise_input_tokens(seq)
        starts = self.word_starts[idx]
        ends = self.word_ends[idx]
        soft_labels = self.soft_labels[idx]
        output_dict = {'input_ids': seq,
                       'attention_masks': attn_mask,
                       'labels': labels,
                       'noisy_labels': noisy_labels,
                       'noised_ids': noised_ids,
                       'mlm_labels': mlm_labels,
                       'word_starts': starts,
                       'word_ends': ends,
                       'soft_labels': soft_labels,
                       }
        # return seq, attn_mask, labels, noisy_labels, noised_ids, mlm_labels, starts, ends
        return output_dict

    def save(self, filepath):
        '''
        Save data module to file
        '''
        with open(filepath, 'wb') as f:
            pickle.dump(self.__dict__, f)

    @classmethod
    def load(self, filepath):
        '''
        Load data module from file
        '''
        with open(filepath, 'rb') as f:
            self.__dict__ = pickle.load(f)


class RegalDataset(Dataset):
    # RPN Dataset to mask keywords used in rules
    def __init__(self,
                 text,
                 encoded_text,
                 attention_masks,
                 labels,
                 tokenizer,
                 rules,
                 mask_prob=.1):
        '''
        Initialize dataset class
        Inputs:
        text: List of str
            Input text of datapoints to classify
        labels: List of torch.LongTensor
            Labels corresponding to each datapoint
        tokenizer:
            Huggingface tokenizer object to encode text
        Rules: List of Rule
            Labeling functions to create noisy labels
        '''
        self.text = data['text']
        self.encoded_text = data['encoded_text']
        self.attention_masks = data['attention_masks']
        self.labels = data['labels']
        self.noisy_labels = data['noisy_labels']
        # Tokenizer attributes
        self.tokenizer = tokenizer
        self.word2id = tokenizer.vocab
        self.mask_id = self.word2id['[MASK]']

        # Get vocab size
        self.vocab_size = int(np.max(list(self.word2id.values())) + 1)
        self.num_special_tokens = int(np.max([val for key, val in self.word2id.items() if key.startswith('[')]) + 1)
        # Rule attributes
        self.rule_tokens = rule_tokens
        self.rule_map = {val: val for val in self.word2id.values()}
        self.update_rule_map(rule_tokens)
        self.is_rule = {val: 0 for val in self.word2id.values()}
        for w in rule_tokens:
            self.is_rule[self.word2id[w]] = 1

        # Misc attributes
        self.p = mask_prob
        self.length = len(self.text)

    def __len__(self):
        '''
        Length attribute
        '''
        return self.length

    def __getitem__(self, idx):
        '''
        Return items from dataset for dataloader
        '''
        seq = self.encoded_text[idx]
        attn_mask = self.attention_masks[idx]
        labels = self.labels[idx]
        noisy_labels = self.noisy_labels[idx]
        noised_ids, mlm_labels = self.noise_input_tokens(seq)
...
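In this dataset class rule_keywords is a constructor argument: a plain dictionary mapping each label to the keywords and phrases that vote for it. make_lfs() turns single-word entries into Snorkel keyword labeling functions applied to the per-example word lists, and entries containing a space into phrase labeling functions applied to the raw text. A small sketch of the expected shape; the labels and keywords below are invented for illustration.

# Hedged sketch of the rule_keywords mapping RPNDataset.make_lfs() expects:
# {label_id: [keyword or phrase, ...]}. Labels/keywords are illustrative only.
rule_keywords = {
    0: ["boring", "waste of time"],   # votes for class 0
    1: ["brilliant", "must see"],     # votes for class 1
}

# Single-word entries become keyword LFs over word lists; entries containing a
# space become phrase LFs over the raw text (mirroring make_lfs() above).
keyword_rules = [w for words in rule_keywords.values() for w in words if ' ' not in w]
phrase_rules = [w for words in rule_keywords.values() for w in words if ' ' in w]
print(keyword_rules)   # ['boring', 'brilliant']
print(phrase_rules)    # ['waste of time', 'must see']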


parameter_test.py

Source: parameter_test.py (GitHub)


...
            start
                = {rule_positional | rule_keywords | rule_all} $ ;
            rule_positional('ABC', 123, '=', '+')
                = 'a' ;
            rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
                = 'b' ;
            rule_all('DEF', 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
                = 'c' ;
        '''
        pretty = '''
            @@ignorecase :: False
            @@nameguard :: True
            start
                =
                {rule_positional | rule_keywords | rule_all} $
                ;
            rule_positional(ABC, 123, '=', '+')
                =
                'a'
                ;
            rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
                =
                'b'
                ;
            rule_all(DEF, 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
                =
                'c'
                ;
        '''
        model = compile(grammar, 'RuleArguments')
        self.assertEqual(trim(pretty), str(model))
        model = compile(pretty, 'RuleArguments')
        ast = model.parse("a b c")
        self.assertEqual(['a', 'b', 'c'], ast)
        semantics = TC36Semantics()
        ast = model.parse("a b c", semantics=semantics)
        self.assertEqual(['a', 'b', 'c'], ast)
        codegen(model)

    def test_36_unichars(self):
        grammar = '''
            start = { rule_positional | rule_keywords | rule_all }* $ ;
            rule_positional("ÄÖÜäöüß") = 'a' ;
            rule_keywords(k1='äöüÄÖÜß') = 'b' ;
            rule_all('ßÄÖÜäöü', k1="ßäöüÄÖÜ") = 'c' ;
        '''

        def _trydelete(pymodule):
            import os
            try:
                os.unlink(pymodule + ".py")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyc")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyo")
...


Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub. From setting up the prerequisites and running your first automation test, through following best practices, to diving deeper into advanced test scenarios, the LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.


YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run Gherkin-python automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest now and get 100 minutes of automation testing for free.

