How to use getFirstContent method in unittest-xml-reporting

Best Python code snippet using unittest-xml-reporting_python

scrapy_b.py

Source: scrapy_b.py (GitHub)


...
    return soup


# Get the list of links to each anime (bangumi)
def getFirstContent(soup):
    # print(content)
    # soup = BeautifulSoup(content, "html.parser")
    # extract the video links from the listing/search page
    next_urls = []
    infos = soup.find_all('a', 'bangumi-title')
    for info in infos:
        next_urls.append(info['href'].strip())
    # print(len(infos))

    return next_urls


# Get detail information for each anime
def getDetail(path, fname_detail):
    links_ = pd.read_csv(path)
    links = links_.drop_duplicates()  # there may be duplicates, drop them
    urls = links['links']
    cont_id = 0
    print("start!")
    v_ids = []        # ids
    titles = []       # titles
    genres = []       # genres
    years = []        # years
    long_comms = []   # number of long reviews
    short_comms = []  # number of short reviews
    detail_link = []  # link of the current detail page
    for url2 in tqdm(urls):
        try:
            soup1 = get_soup(r'http:' + url2)
            next_link = soup1.find('a', 'media-title')['href']
            soup2 = get_soup(r'http:' + next_link + r'#long')  # long-review page

            '''
            soup2.find('div', 'media-tab-nav').find('ul').find_all('li'):
            [<li class="">作品详情</li>,
             <li class="on">长评 ( 572 )</li>,
             <li class="">短评 ( 117867 )</li>,
             <li class="">相关视频</li>]
            '''
            # Review counts: from '长评 ( 572 )' take the number 572 and cast to int;
            # entries without review information are skipped
            long = int(soup2.find('div', 'media-tab-nav').find('ul').find_all('li')[1].string[5:-2])
            short = int(soup2.find('div', 'media-tab-nav').find('ul').find_all('li')[2].string[5:-2])
            long_comms.append(long)
            short_comms.append(short)
            # title
            title = soup2.find('span', 'media-info-title-t').string
            titles.append(title)
            # tags
            tags = ''
            for tag in soup2.find('span', 'media-tags').children:
                tags = tags + str(tag.string) + ','  # e.g. tags='漫画改,战斗,热血,声控,'
            genres.append(tags)
            # extract the air date, e.g. '2019年4月7日开播'
            year = soup2.find('div', 'media-info-time').span.string[:-2]
            years.append(year)

            # record the video id
            v_ids.append(soup1.find('a', 'av-link').string)
            cont_id += 1
            # v_ids.append(cont_id)
            # record the current detail-page link
            detail_link.append(r'http:' + next_link)

            # soup2.find('div','review-list-wrp type-long').find('ul').contents
            if cont_id % 10 == 0:
                print('已爬取%d条' % cont_id)  # "crawled %d entries"
            # write every 5 entries so an interruption does not lose data
            if cont_id % 5 == 0:
                # write
                Data_detail = {'v_id': v_ids, 'title': titles, 'genres': genres, 'year': years,
                               'long_comm': long_comms,
                               'short_comm': short_comms, 'detail_link': detail_link}
                wirte2csv(Data_detail, fname_detail)
                # reset the buffers
                v_ids = []
                titles = []
                genres = []
                years = []
                long_comms = []
                short_comms = []
                detail_link = []
            time.sleep(5)

        except Exception:
            pass
    return


# Get the related recommendations for each anime
def getRecommond(path, fname_detail):
    detail_data = pd.read_csv(path)
    detail_data_ = detail_data.drop_duplicates()  # there may be duplicates, drop them
    urls = detail_data_['detail_link']
    cont_id = 0
    print("start!")
    v_ids = []      # ids
    rec_id = []     # recommended ids
    rec_title = []  # recommended titles
    for url2 in tqdm(urls):
        try:
            soup1 = get_soup(url2)
            # increase the counter
            cont_id += 1

            v_ids.append(detail_data_.loc[cont_id, 'v_id'])
            # titles of the recommended anime
            tmp_title = []
            for title in soup1.find_all('div', 'slide-item-title'):
                tmp_title.append(title.string)
            rec_title.append(tmp_title)
            # links of the recommended anime
            rec_links = []
            for l in soup1.find_all('div', 'slide-item-info'):
                rec_links.append(l.find('a')['href'])
            # ids of the recommended anime
            tmp_id = []
            for link in rec_links:
                soup2 = get_soup(r'http:' + link)
                tmp_id.append(soup2.find('a', 'av-link').string)

            rec_id.append(tmp_id)

            if cont_id % 10 == 0:
                print('已爬取%d条' % cont_id)  # "crawled %d entries"

            # write every 5 entries so an interruption does not lose data
            if cont_id % 5 == 0:
                # write
                Data_detail = {'v_id': v_ids, 'rec_id': rec_id, 'rec_title': rec_title}
                wirte2csv(Data_detail, fname_detail)
                # reset the buffers
                v_ids = []
                rec_id = []
                rec_title = []

            time.sleep(rand_seconds)

        except Exception:
            pass
    return


def process_time(rat_time):
    # '2020-05-07' has len == 10
    if len(rat_time) == 10:
        return rat_time
    else:
        if len(re.findall(r'^\d+小时前$', rat_time)):    # 'N hours ago'
            return (datetime.datetime.now() - datetime.timedelta(hours=int(rat_time[:-3]))).strftime("%Y-%m-%d")

        elif len(re.findall(r'^\d+分钟前$', rat_time)):  # 'N minutes ago'
            return (datetime.datetime.now()).strftime("%Y-%m-%d")

        elif rat_time == '昨天':  # 'yesterday'
            return (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")

        elif len(rat_time) == 5:  # year is missing
            return str(datetime.datetime.now().year) + '-' + rat_time


# Scroll the review page and collect the review information
def get_rating(url, page_num):
    # load the page
    driver.get(url)
    # driver.get(url + r'#long')
    # page_num = long_page_num
    id_names = []
    ratings = []
    rating_times = []
    # one scroll per loop iteration
    for i in range(page_num):
        # run a small JS snippet in the browser; document.body.scrollHeight is the page height
        js = "window.scrollTo(0,document.body.scrollHeight)"
        driver.execute_script(js)
        time.sleep(rand_seconds)
        # On bilibili everything above the scroll position stays loaded,
        # so only the page source after the final scroll is needed
        if i == page_num - 1:
            # grab the page source
            content = driver.page_source
            # parse it
            soup = BeautifulSoup(content, 'lxml')
            # collect the reviewers on this page
            for li in soup.find_all('li', 'clearfix'):
                id_names.append(li.find('div', re.compile('review-author-name')).string.strip())
                rat = len(li.find_all('i', 'icon-star icon-star-light'))  # rating = number of lit stars
                ratings.append(rat)

                rat_time = li.find('div', 'review-author-time').string
                # normalize special time formats
                rat_time_2 = process_time(rat_time)
                rating_times.append(str(rat_time_2))

    return id_names, ratings, rating_times


# Get the ratings and related information and save them to csv
def get_rating_data(path):
    detail = pd.read_csv(path)
    # print(min(detail['short_comm']+detail['long_comm']))  # 230
    # print(detail.columns)  # ['v_id', 'title', 'genres', 'year', 'long_comm', 'short_comm','detail_link']
    minn = min(detail['short_comm'] + detail['long_comm'])
    rating_links = detail['detail_link']
    long_num = detail['long_comm']
    short_num = detail['short_comm']
    v_ids = detail['v_id']
    for ind, url in enumerate(tqdm(rating_links)):
        # print(ind, url)
        # if ind < 425:
        #     continue
        # take long and short reviews proportionally
        # print(v_ids[61])
        # Long reviews are fewer than short ones, so a purely proportional split of the minimum
        # total could ask for more long reviews than exist and lose data; take the minimum here
        lon = min(int((long_num[ind] / (long_num[ind] + short_num[ind])) * minn), long_num[ind])
        sho = minn - lon

        long_page_num = math.ceil(lon / 20)   # 20 entries per page: how many pages to scroll
        short_page_num = math.ceil(sho / 20)  # 20 entries per page: how many pages to scroll

        id_l, rat_l, time_l = get_rating(url + r'#long', long_page_num)
        id_s, rat_s, time_s = get_rating(url + r"#short", short_page_num)
        # print(len(id_l))
        # print(len(id_s))

        # keep only the allotted number of long and short reviews
        id_total = id_l[0:lon] + id_s[0:sho]
        rat_total = rat_l[0:lon] + rat_s[0:sho]
        rating_time_total = time_l[0:lon] + time_s[0:sho]
        # print(len(id_total))
        # print(len(rat_total))

        # pack into a DataFrame
        Data_rating = {'user_id_name': id_total, 'v_id': [v_ids[ind]] * minn,
                       'rating': rat_total, 'rating_time': rating_time_total}
        # print(Data_rating)
        fname_rating = "rating_data.csv"
        wirte2csv(Data_rating, fname_rating)
    return


# Write to csv
def wirte2csv(Data, fname):
    try:
        if os.path.exists(fname):
            DataFrame = pd.DataFrame(Data)
            DataFrame.to_csv(fname, index=False, sep=',', mode='a', header=False)
            print('追加成功!')  # "append succeeded"
        else:
            DataFrame = pd.DataFrame(Data)
            DataFrame.to_csv(fname, index=False, sep=',')
            print('save!')
    except:
        print('fail')


if __name__ == '__main__':
    flag1 = 0  # whether to crawl the anime listing pages
    flag2 = 0  # whether to crawl the anime details
    flag3 = 0  # whether to crawl the ratings
    flag4 = 1  # whether to crawl the related recommendations
    if flag1:
        # step1
        for i in tqdm(range(21)):
            # Start from 0 because the first page visited gets requested twice and would be
            # crawled twice, so at i == 0 load the page but do not store anything.
            # Anime listing pages 1-20
            url = 'https://www.bilibili.com/anime/index/#season_version=-1&area=-1' \
                  '&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1' \
                  '&style_id=-1&order=3&st=1&sort=0&page=' + str(i + 1)
            # refresh - important! otherwise the first page may be crawled repeatedly
            driver.refresh()
            # print(url)
            soup = get_soup(url)
            if i == 0:
                continue
            # driver.find_element_by_class_name('p next-page').click()
            next_urls = getFirstContent(soup)
            print(next_urls)
            # write to csv
            Data_link = {'links': next_urls}
            fname_link = "link_data.csv"
            wirte2csv(Data_link, fname_link)
            print('爬到第%d页' % i)  # "crawled page %d"
            # pause
            time.sleep(5)
    if flag2:
        # step2
        path = r'D:\Learning\postgraduate\bilibili\scrapy_py\link_data.csv'
        # crawl the details and save to a new csv
        getDetail(path, fname_detail="video_data.csv")
    if flag3: ...
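In this snippet, getFirstContent simply collects the href of every <a class="bangumi-title"> element from an already-parsed listing page. Below is a minimal, self-contained sketch of the same call; the sample HTML is illustrative only (the real script feeds getFirstContent a Selenium-rendered bilibili page via its get_soup helper):

# Minimal sketch of calling this snippet's getFirstContent.
# The markup below is a made-up example, not taken from the original source.
from bs4 import BeautifulSoup

def getFirstContent(soup):
    # same logic as the snippet above: collect every <a class="bangumi-title"> href
    return [a['href'].strip() for a in soup.find_all('a', 'bangumi-title')]

sample_html = '''
<ul>
  <li><a class="bangumi-title" href="//www.bilibili.com/bangumi/media/md0001/">Title A</a></li>
  <li><a class="bangumi-title" href="//www.bilibili.com/bangumi/media/md0002/">Title B</a></li>
</ul>
'''
soup = BeautifulSoup(sample_html, 'html.parser')
print(getFirstContent(soup))
# ['//www.bilibili.com/bangumi/media/md0001/', '//www.bilibili.com/bangumi/media/md0002/']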


wikiCrawler.py

Source: wikiCrawler.py (GitHub)


...
from nltk import pos_tag
from mtranslate import translate

remove_words = ["\\n", "\"", "\\t", "[", "]", "‘", "’", "·"]
stop_words = set(stopwords.words('english'))


def getFirstContent(soup):
    p_sentences = list()
    div = soup.find('div', {'class': 'mw-parser-output'})
    if (div is None):
        return ''
    children = div.findChildren(recursive=False)
    for child in children:
        if (child.name == 'h2' or child.name == 'h3'):
            break
        if (child.name == 'p'):
            if (child.text == "\\n\\n"):
                break
            tags_to_delete = child.findAll('sup')
            if (tags_to_delete is not None):
                for tg in tags_to_delete:
                    tg.extract()
            articleText = child.get_text(" ").replace(u'\xa0', u' ')
            articleText = articleText.replace("\\'", "'")
            for word in remove_words:
                if word in articleText:
                    articleText = articleText.replace(word, u"")
            # remove the characters between the parentheses and brackets
            articleText = re.sub("[\(\[].*?[\)\]]", "", articleText)
            # remove multi-spaces
            articleText = re.sub(" +", " ", articleText)
            sentences = list(map(str.strip, re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", articleText)))
            for each_sentence in sentences:
                if (len(each_sentence) >= 10):
                    p_sentences.append(each_sentence)
    return p_sentences


def getSentences(response, outfile):
    doc = Document(response.text)
    content = Document(doc.content()).summary()
    soup = BeautifulSoup(content, "html.parser")
    delete_tags = ['figure']
    for tag in delete_tags:
        tags_to_delete = soup.findAll(tag)
        if (tags_to_delete is not None):
            for tg in tags_to_delete:
                tg.extract()
    tags_to_delete = soup.findAll('p', text="\\n")
    if (tags_to_delete is not None):
        for tg in tags_to_delete:
            tg.extract()
    tags_to_delete = soup.findAll('p', {"class": "shortdescription"})
    if (tags_to_delete is not None):
        for tg in tags_to_delete:
            tg.extract()
    p_sentences = getFirstContent(soup)
    if (p_sentences == ''):
        return False
    for sen in p_sentences:
        outfile.write(sen + "\n")
    outfile.close()
    return True


def getContentOnWiki(link, rec=True):
    visitedUrlFile = "visited_urls.txt"
    try:
        fileUrls = open(visitedUrlFile, 'r', encoding='utf-8')
    except IOError:
        visitedUrls = []
    else:
        visitedUrls = [url.strip() for url in fileUrls.readlines()]
...
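Here getFirstContent expects a parsed Wikipedia page and returns the sentences of the lead section (everything before the first h2/h3). A short usage sketch follows; it assumes the getFirstContent definition above is in scope together with its imports, and the article URL is an illustrative choice, not one taken from the original script:

# Usage sketch (assumes the snippet's getFirstContent, re and remove_words are in scope).
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://en.wikipedia.org/wiki/Web_scraping', timeout=10)
soup = BeautifulSoup(resp.text, 'html.parser')
lead_sentences = getFirstContent(soup)  # '' if the mw-parser-output div is missing
for sentence in lead_sentences:
    print(sentence)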


train.py

Source: train.py (GitHub)


...
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
import os
import sys


def getFirstContent(dataUrl, modelUrl, modelName):
    training_data = load_files(dataUrl, encoding="utf-8")
    '''
    Start feature extraction; the features here are word-frequency counts.
    '''
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_data.data)
    '''
    Start feature extraction; the features here are TF-IDF features.
    '''
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    '''
    Use Naive Bayes classification and make a simple prediction
    (the pipeline below actually wraps a LogisticRegression classifier).
    '''
    mnb_pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    mnb_pipeline.fit(X_train_tfidf, training_data.target)

    # save in pkl format
    joblib.dump(mnb_pipeline, modelUrl + modelName)
    # save in pmml format
    sklearn2pmml(mnb_pipeline, modelUrl + modelName, with_repr=True)
    if (os.path.exists(modelUrl + modelName)):
        return "success"
    else:
        return "fail"


if __name__ == '__main__':
    a = []
    for i in range(1, len(sys.argv)):
        a.append((str(sys.argv[i])))
...
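In this variant, getFirstContent takes a training-data directory, fits a TF-IDF plus LogisticRegression pipeline, and writes the model to disk. A brief usage sketch under assumed, hypothetical paths (the directory must follow the load_files layout: one sub-folder per category, each containing text files):

# Usage sketch with made-up paths; assumes the getFirstContent above and its
# dependencies (scikit-learn, sklearn2pmml) are available.
status = getFirstContent(dataUrl="./data/train/",
                         modelUrl="./models/",
                         modelName="text_clf.pkl")
print(status)  # "success" if the model file was written, otherwise "fail"

Note that on recent scikit-learn releases the import "from sklearn.externals import joblib" no longer exists; the standalone joblib package ("import joblib") provides the same dump/load API.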


Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, right from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run unittest-xml-reporting automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now!!

Get 100 minutes of automation testing FREE!!

Next-Gen App & Browser Testing Cloud
