How to use visible_texts method in Selene

Best Python code snippet using selene_python
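
All three snippets on this page build a visible_texts value with BeautifulSoup rather than calling a method on Selene itself: they collect every text node with find_all(text=True) (written findAll in older code) and then filter out nodes whose parent tag is not rendered (script, style, head, title, meta) or that are HTML comments. A minimal sketch of that pattern, using an inline HTML string purely for illustration:

from bs4 import BeautifulSoup
from bs4.element import Comment

def tag_visible(element):
    # Drop text nodes that live inside non-rendered tags or inside HTML comments.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

html = "<html><head><title>t</title></head><body><p>Hello</p><script>var x = 1;</script></body></html>"
soup = BeautifulSoup(html, 'html.parser')
texts = soup.find_all(text=True)
visible_texts = [t.strip() for t in texts if tag_visible(t) and t.strip()]
print(visible_texts)  # ['Hello']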

scrape_from_url.py

Source: scrape_from_url.py (GitHub)

# -*- coding: utf-8 -*-
"""This module is used to scrape all visible texts from classified URLs."""
from bs4 import BeautifulSoup, Comment
from fake_useragent import UserAgent
from urllib.request import urlopen
from urllib.error import URLError
from typing import List, Tuple
from nltk import sent_tokenize
from _socket import gaierror
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import urllib
import pickle
import bs4
import re
import os

UNIVERSAL_ENCODING = "utf-8"


def tag_visible(element) -> bool:
    """Filter tags in HTML.

    Args:
        element: a bs4 element (text node) to be filtered.
    Returns:
        False for text belonging to invisible tags or comments,
        True for visible text.
    """
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def crawl(URL: List[str]) -> Tuple[List[str], List[str]]:
    """Crawl corpus from classified URLs.

    Args:
        URL: a list of URL strings to be scraped.
    Returns:
        contents: a list of string contents scraped from the given URLs.
        valid_URL: the URLs that were successfully scraped, kept for side-by-side annotation.
    """
    valid_URL = []
    contents = []
    for index, url in enumerate(tqdm(URL)):
        request = urllib.request.Request(url, headers={'User-Agent': UserAgent().random})
        try:
            html = urlopen(request, timeout=10).read().decode('utf-8')
        except gaierror as e:
            print(index, e, url)
            continue
        except URLError as e:
            print(index, e, url)
            continue
        except Exception:
            print("Something else went wrong with", url, "\n")
            continue  # skip this URL; otherwise `html` would be stale or undefined below
        soup = BeautifulSoup(html, features='lxml')
        texts = soup.findAll(text=True)
        # Keep only visible text, then format and clean the corpus.
        visible_texts = filter(tag_visible, texts)
        visible_texts = "".join(text for text in visible_texts)
        visible_texts = re.sub(r"(\r)+", "\r", visible_texts)
        visible_texts = re.sub(r"(\n)+", "\n", visible_texts)
        visible_texts = re.sub(r"(\r\n)+", "\n", visible_texts)
        visible_texts = re.sub(r"(\r)+", "\r", visible_texts)
        visible_texts = re.sub(r"(\n)+", "\n", visible_texts)
        visible_texts = re.sub(r"\n(\s)+", "\n", visible_texts)
        visible_texts = re.sub(r"\s\n(\s)*", "\n", visible_texts)
        visible_texts = re.sub(r"\n(\W)+\n", "\n", visible_texts)
        visible_texts = re.sub(r"^(\s)+", "", visible_texts)
        visible_texts = re.sub(r"(\s)+$", "", visible_texts)
        visible_texts = re.sub(r"\. ", ".\n", visible_texts)
        visible_texts = re.sub(r"\w(\. \n)\w", ".\n", visible_texts)
        sentences = sent_tokenize(visible_texts)
        visible_texts = "\n".join(sentence for sentence in sentences)
        if visible_texts:
            valid_URL.append(url)
            contents.append(visible_texts)
            del visible_texts
    assert len(contents) == len(valid_URL)
    return contents, valid_URL


if __name__ == '__main__':
    corpus_folder = Path("../Data/Corpus2/")
    url_folder = Path("../Course_Collected/")
    websites = pd.read_csv(url_folder / "Final.csv")
    contents, scraped_URL = crawl(websites.URL)
    pickle.dump(contents, open(corpus_folder / "content.p", "wb"))
    pickle.dump(scraped_URL, open(corpus_folder / "url.p", "wb"))
    contents = pickle.load(open(corpus_folder / "content.p", "rb"))
    scraped_URL = pickle.load(open(corpus_folder / "url.p", "rb"))

    # Save the corpus to files, one document per index.
    corpus_index = 0
    for content in tqdm(contents):
        file_name = str(corpus_index) + ".txt"
        full_file_path = corpus_folder / file_name
        with open(full_file_path, "w", encoding=UNIVERSAL_ENCODING) as file:
            file.write(content)
        corpus_index += 1
    # Save the URL list.
    url_file_name = "url.txt"
    url_full_file_path = corpus_folder / url_file_name
    file = open(url_full_file_path, "w", encoding=UNIVERSAL_ENCODING)
    for key, value in tqdm(enumerate(scraped_URL)):
        if key != len(scraped_URL) - 1:
            file.write(value + '\n')
        else:
            file.write(value)


webscrap1.py

Source: webscrap1.py (GitHub)


import bs4 as bs
from bs4.element import Comment
import urllib.request
import re
import string
from nltk.tokenize import word_tokenize
# Note: newer scikit-learn versions expose this as sklearn.feature_extraction.text.ENGLISH_STOP_WORDS.
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
# from nltk.stem import WordNetLemmatizer
from heapq import nlargest
import ssl
from time import time
import userAgents  # local helper module providing getRandomUserAgent()


def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    print("in text_from_html")
    soup = bs.BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    # print(texts)
    return visible_texts
    # return u" ".join(t.strip() for t in visible_texts)


# Disable certificate verification so sites with broken SSL can still be fetched.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
ourStopWords = set({"click", "view", "more", "link", "forgot", "password"})

print("imported")
# startTime = time()


def getSiteKeywords(url):
    html = ''
    count = 0
    while True:
        try:
            req = urllib.request.Request(url, data=None,
                                         headers={'User-Agent': userAgents.getRandomUserAgent()})
            html = urllib.request.urlopen(req, context=ctx).read()
            # html = urllib.request.urlopen(url, context=ctx).read()
            break
            # sauce = urllib.request.urlopen(url).read()
            # soup = bs.BeautifulSoup(sauce, 'lxml')
        except Exception as e:
            print(e, url)
            if count == 2:
                return []
            count += 1
    # print(soup.get_text())
    # print(soup)
    # return None
    # tags = ['a', 'h1', 'h2', 'h3', 'meta', 'title', 'p', 'div']
    # keywords = []
    # for i in tags:
    #     keywords += list(soup.find_all(i))

    d = dict()
    # lemmatizer = WordNetLemmatizer()
    removeSpcCharPattern = re.compile(r'[\W_]+')
    # print(type(html))
    # count = 0
    for data in text_from_html(html):
        if len(data) <= 2:
            continue
        # print(data, len(data))
        text = data.lower().strip()
        text = removeSpcCharPattern.sub(' ', text)
        text = text.strip()
        text = re.sub(r'\d+', '', text)
        text = re.sub(r"[^\w\s]", "", text)
        # text = text.translate(string.maketrans("", "", string.punctuation))
        # print(text)
        tokens = word_tokenize(text)
        for i in tokens:
            if i not in ENGLISH_STOP_WORDS | ourStopWords:
                # result = lemmatizer.lemmatize(i)
                result = i
                if len(result) <= 2:
                    continue
                if result not in d:
                    d[result] = 1
                else:
                    d[result] += 1
        # if count == 20:
        #     break
        # count += 1

    largest50 = nlargest(min(50, len(d)), d, key=d.get)
    return largest50


# print(getSiteKeywords("https://www.itlearn360.com"))
# endTime = time()
# print(endTime - startTime)
"""
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import ssl
print("imported")
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
def text_from_html(body):
    print("in text_from_html")
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    # print(texts)
    return u" ".join(t.strip() for t in visible_texts)
url = "https://expired.badssl.com"
# req = urllib.request(url)
# gcontext = ssl.SSLContext()
print("no error")
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
html = urllib.request.urlopen(url, context=ctx).read()
# html = urllib.urlopen(req, context=gcontext).read()
print(text_from_html(html))
"""
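
The counting loop inside getSiteKeywords can also be written with collections.Counter. The sketch below is just a condensed restatement of the same idea, reusing the text_from_html helper, ENGLISH_STOP_WORDS, and ourStopWords defined above; it is not part of the original script.

from collections import Counter

def top_keywords(html, n=50):
    # Visible text -> lowercase tokens -> drop stop words and short tokens -> top n by count.
    counts = Counter()
    for data in text_from_html(html):
        tokens = word_tokenize(re.sub(r"[^\w\s]", " ", data.lower()))
        counts.update(t for t in tokens
                      if len(t) > 2 and t not in ENGLISH_STOP_WORDS | ourStopWords)
    return [word for word, _ in counts.most_common(n)]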


download10000files.py

Source: download10000files.py (GitHub)


# NOTE: this script targets Python 2 (mechanize, cookielib, and the sets module).
import mechanize
import cookielib
import time
import os
import json
from sets import Set
import lxml.html
import codecs
import re
from bs4 import BeautifulSoup
import textwrap

# with codecs.open(file_name, 'r', encoding='utf8') as f:
#     text = f.read()
#     # process Unicode text


def strStr(haystack, needle):
    """Return the index of the first occurrence of needle in haystack, or -1."""
    if len(haystack) < len(needle):
        return -1
    i = 0
    while i < len(haystack) - len(needle) + 1:
        j = 0
        k = i
        while j < len(needle):
            if haystack[k] == needle[j]:
                j += 1
                k += 1
            else:
                break
        if j == len(needle):
            break
        else:
            i += 1
    if i == len(haystack) - len(needle) + 1:
        return -1
    else:
        return i


def remove(visible_texts, needle):
    """Remove the first occurrence of needle from visible_texts."""
    buffer = ''
    index = strStr(visible_texts, needle)
    if index != -1:
        buffer += visible_texts[:index]
        buffer += visible_texts[index + len(needle):]
    else:
        buffer = visible_texts
    return buffer


def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title', 'link', 'a']:
        return False
    return True


def filteralpha(str):
    str = re.sub(r'([^\s\w]|_)+', '', str)
    return " ".join(str.split(' '))


path = os.path.abspath("/Users/Xiaomin/testproject/tutorial/uniquename.txt")
urlfile = open(path, 'r')
listOfUrl = urlfile.read().split('\n')
print(len(listOfUrl))
i = 9655
from sys import path
c = os.getcwd()
os.chdir('/Users/Xiaomin/cs410hw2')
for url in listOfUrl:
    cj = cookielib.LWPCookieJar()
    br = mechanize.Browser()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.set_debug_http(True)
    br.set_debug_redirects(True)
    br.set_debug_responses(True)
    br.open(url)
    st = br.response().read()
    # filename = url['name'].split('?q=')[1].split('&btn')[0]
    t = lxml.html.parse(url)
    title = t.find(".//title").text
    file = open('xxu46_' + str(i) + '.html', 'wb')
    file.write(st)
    file.close()
    # nonjs = re.subn(r'<(script).*? </\1>(?s)', '', str(st))[0]
    soup = BeautifulSoup(st)
    texts = soup.findAll(text=True)
    visible_texts = filter(visible, texts)
    visible_texts = ''.join(visible_texts)
    needle = '[if IE]><link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/site/consumer-navbar-ie-470687728._CB379390980_.css"><![endif]'
    visible_texts = remove(visible_texts, needle)
    needle = '<br>'
    visible_texts = remove(visible_texts, needle)
    needle = '<a href="/register/sharing">enable Facebook sharing</a>'
    visible_texts = remove(visible_texts, needle)
    visible_texts = filteralpha(visible_texts)
    with codecs.open('xxu46_' + str(i) + '.txt', 'w', encoding='utf8') as txt:
        txt.write(url + '\n')
        txt.write(title + '\n')
        txt.write(''.join(visible_texts))
    i += 1

# import urllib
# >>> html = urllib.urlopen('http://www.nytimes.com/2009/12/21/us/21storm.html').read()
# >>> soup = BeautifulSoup(html)
# >>> texts = soup.findAll(text=True)
# >>> def visible(element):
# ...     if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
# ...         return False
# ...     elif re.match('<!--.*-->', str(element)):
# ...         return False
# ...     return True
# ...
# >>> visible_texts = filter(visible, texts)
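
The strStr/remove pair above hand-rolls a first-occurrence substring removal; in modern Python the same effect comes from str.replace with a count of 1. A small equivalence sketch (not part of the original script):

def remove_first(text, needle):
    # Remove only the first occurrence of needle, like the strStr-based remove() above.
    return text.replace(needle, '', 1)

assert remove_first("a<br>b<br>c", "<br>") == "ab<br>c"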


Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run Selene automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.
