Best Python code snippet using lisa_python
Source:athens_scrapping.py  
import re

import camelot
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateparser.search import search_dates
from sqlalchemy import create_engine

from scraping.models import *


def html_parser(url):
    '''Gets a URL and returns the parsed HTML code to be used as a soup variable'''
    try:
        page = requests.get(url)
    except requests.exceptions.Timeout:
        # TODO: try again in 30 minutes (add trigger)
        raise
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)
    soup = BeautifulSoup(page.content, "html.parser")
    # Returns the HTML code of the website
    return soup


def get_pdf_list(parsed_url):
    '''Gets the list of PDFs for the last few days from the parsed HTML code'''
    soup = html_parser(parsed_url)
    # Get the list of URLs that point to pdf files
    pdf_docs = soup.find_all('a', {'title': re.compile(r'.*\.pdf')})
    # Build the list of titles and urls. NOTE: adjust the number of docs to keep
    try:
        pdf_json = []
        for pdf_doc in pdf_docs[:3]:
            pdf_title = pdf_doc.find(string=True, recursive=False).strip()
            pdf_url = parsed_url + pdf_doc.get('href').strip()
            onhold_date = search_dates(pdf_title, languages=['el'])[0][1]
            # Add to the json item
            item = {"pdf_title": pdf_title, "pdf_url": pdf_url, "onhold_date": onhold_date}
            pdf_json.append(item)
    except Exception:
        print(f"Error s101 {pdf_title}")
        raise Exception(f"Couldn't build json file with url, title: {pdf_title} and onhold date")
    print("Page parsed and json has been built with doc title, url and onhold date")
    return pdf_json


# TODO: Change to be secure
def pass_to_database(username, password, server, port, database, dataframe, table):
    engine = create_engine(f'postgresql://{username}:{password}@{server}:{port}/{database}')
    dataframe.to_sql(table, engine, if_exists='replace')


def runcsript():
    # Define the URL that you want to scrape:
    athens_hospitals_url = 'https://www.moh.gov.gr/articles/citizen/efhmeries-nosokomeiwn/68-efhmeries-nosokomeiwn-attikhs'
    pdf_json_info = get_pdf_list(athens_hospitals_url)  # json data of hospitals
    '''Start of cleanup process
    for i, json_item in enumerate(pdf_json_info):
        if "ΟΡΘΗ ΕΠΑΝΑΚΟΙΝΟΠΟΙΗΣΗ" in json_item['pdf_title'] and json_item['onhold_date'] == pdf_json_info[i+1]['onhold_date']:
            pdf_json_info.pop(i+1)
    '''
    tables_received = []
    for pdf_item in pdf_json_info:
        pdf_url = pdf_item["pdf_url"]
        pdf_title = pdf_item["pdf_title"]
        onhold_date = pdf_item["onhold_date"]
        print(f"Going through the item with title: {pdf_title}")
        try:
            # Read the pdf
            tables = camelot.read_pdf(pdf_url, pages='1-end')
            number_of_tables = tables.n
            num_of_columns = len(tables[0].df.columns)
            print(f"num of tables: {number_of_tables}, num of columns: {num_of_columns}")
        except Exception:
            print(f"Error s102 {pdf_title}")
            raise Exception(f"Couldn't parse the pdf file with title: {pdf_title}")
        '''Process to concat tables'''
        try:
            all_tables = []
            for i in range(number_of_tables):
                table = tables[i].df
                all_tables.append(table)
            concat_pdf_tables = pd.concat(all_tables, axis=0, ignore_index=True)
            concat_pdf_tables.rename(columns={concat_pdf_tables.columns[0]: "clinic"}, inplace=True)  # Set first column to clinic
            start_new_table_from = concat_pdf_tables.loc[concat_pdf_tables['clinic'].str.contains("ΚΛΙΝΙΚΕΣ|Κλινικές", case=False)].first_valid_index()  # Returns the index of the first header row
            concat_pdf_tables = concat_pdf_tables.iloc[start_new_table_from:].reset_index(drop=True)
            new_header = concat_pdf_tables.iloc[0]  # grab the first row for the header
            concat_pdf_tables = concat_pdf_tables[1:]  # take the data less the header row
            concat_pdf_tables.columns = new_header  # set the header row as the df header
            concat_pdf_tables = concat_pdf_tables.reset_index(drop=True)
            concat_pdf_tables.rename(columns={concat_pdf_tables.columns[0]: "clinic"}, inplace=True)  # Set first column to clinic
            search = concat_pdf_tables.loc[concat_pdf_tables['clinic'].str.contains("ΚΛΙΝΙΚΕΣ|Κλινικές", case=False)]  # Find all the repeated header rows so they can be removed
            final_results = concat_pdf_tables.drop(search.index.values).reset_index(drop=True)
            df_unpivoted = final_results.melt(id_vars=['clinic', 'ΠΑΡΑΤΗΡΗΣΕΙΣ'], var_name='onhold_time', value_name='hospital_names')
            df_unpivoted['hospital_names'].replace('', np.nan, inplace=True)
            df_unpivoted.dropna(subset=['hospital_names'], inplace=True)
            df_unpivoted = df_unpivoted.reset_index(drop=True)
            cleanup_process = df_unpivoted.rename(columns={"ΠΑΡΑΤΗΡΗΣΕΙΣ": "note", "onhold_time": "onhold_hour", "hospital_names": "hospital_name"})
            cleanup_process.head()
            # Re-join hospital names that the PDF extraction split across lines
            # cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΠΕΙΡΑΙΑΣ\n', 'ΠΕΙΡΑΙΑΣ ', regex=True)  # remove this if needed
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΠΕΙΡΑΙΑΣ \n', ', ΠΕΙΡΑΙΑΣ ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΠΕΙΡΑΙΑΣ\n', ' ΠΕΙΡΑΙΑΣ ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΠΕΙΡΑΙΑΣ \n', 'ΠΕΙΡΑΙΑΣ ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'Γ. \nΓΕΝΝΗΜΑΤΑΣ', 'Γ. ΓΕΝΝΗΜΑΤΑΣ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'Η \nΠΑΜΜΑΚΑΡΙΣΤΟΣ', 'Η ΠΑΜΜΑΚΑΡΙΣΤΟΣ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΑΓ. \nΠΑΝΤΕΛΕΗΜΩΝ', 'ΑΓ. ΠΑΝΤΕΛΕΗΜΩΝ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΑΓΙΟΙ \nΑΝΑΡΓΥΡΟΙ', 'ΑΓΙΟΙ ΑΝΑΡΓΥΡΟΙ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'ΑΓΙΟΣ \nΣΑΒΒΑΣ', 'ΑΓΙΟΣ ΣΑΒΒΑΣ', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nÎ.', ', Î.', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΠ.', ', Π.', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nÎ.', ', Î.', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\nΨ.', ', Ψ.', regex=True)
            cleanup_process = cleanup_process.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)
            # Split cells that list more than one hospital into separate rows
            separate_data = cleanup_process['hospital_name'].str.split(',').apply(pd.Series).stack()
            separate_data.index = separate_data.index.droplevel(-1)
            separate_data.name = 'hospital_name'
            del cleanup_process['hospital_name']
            final_data = cleanup_process.join(separate_data)
            final_data['onhold_date'] = onhold_date
            final_data['region'] = "Athens"
            final_data['hospital_name'] = final_data['hospital_name'].str.strip()
            final_data = final_data[final_data.hospital_name != 'ΠΕΙΡΑΙΑΣ']
            final_data = final_data.reset_index(drop=True)
            tables_received.append(final_data)
            print(f"Table for day: {onhold_date} and pdf {pdf_title} was added")
        except Exception:
            print(f"Error s103 {table}")
            raise Exception(f"Couldn't build the table from the dataframe: {table}")
    df_merge = pd.concat(tables_received)
    df_merge = df_merge.reset_index(drop=True)
    # Not able to iterate directly over the DataFrame, so return it as a list of records
    df_records = df_merge.to_dict('records')
    return df_records


if __name__ == '__main__':
    # executed as a script
    # do something...
    pass
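The `__main__` block above is only a placeholder, so the sketch below shows one way the pieces could be wired together. It assumes it runs in the same module as runcsript() and pass_to_database(); every connection value (db_user, db_pass, localhost, 5432, hospitals_db) and the table name athens_onhold are hypothetical stand-ins, not settings from the original project.

# Minimal driver sketch, assuming it sits alongside runcsript() and pass_to_database().
# All connection values below are placeholders, not real credentials.
records = runcsript()                     # list of dicts, one per clinic / on-call slot
df = pd.DataFrame.from_records(records)   # back to a DataFrame so to_sql() can write it
pass_to_database(
    username="db_user",                   # placeholder
    password="db_pass",                   # placeholder
    server="localhost",                   # placeholder
    port=5432,                            # placeholder
    database="hospitals_db",              # placeholder
    dataframe=df,
    table="athens_onhold",                # hypothetical table name
)

Because pass_to_database() calls to_sql() with if_exists='replace', each run overwrites the previous table rather than appending to it.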
Source:main.py  
from subprocess import Popen, PIPE
import atexit
import os

agent_processes = [None, None]

def cleanup_process():
    global agent_processes
    for proc in agent_processes:
        if proc is not None:
            proc.kill()

def js_agent(observation, configuration):
    """
    a wrapper around a js agent
    """
    global agent_processes
    agent_process = agent_processes[observation.player]
    ### Do not edit ###
    if agent_process is None:
        cwd = os.path.dirname(configuration["__raw_path__"])
        agent_process = Popen(["node", "dist/main.js"], stdin=PIPE, stdout=PIPE, cwd=cwd)
...
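The fragment above is cut off before atexit is ever used; presumably cleanup_process() is meant to be registered as an exit handler so the node subprocess started by js_agent() gets killed when the Python interpreter exits. A minimal sketch of that registration, under that assumption:

# Assumed continuation of main.py: run cleanup_process() on interpreter exit
# so any node subprocess spawned by js_agent() is terminated via proc.kill().
atexit.register(cleanup_process)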
