How to use JSONArray method in autotest

Best Python code snippet using autotest_python
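
Python itself has no JSONArray type: what Java's JSON libraries call a JSONArray is simply a plain Python list once a document is parsed with the standard json module, and a list of dicts serializes back to a JSON array the same way. The snippets below build or consume such lists (named jsonarray or jsonArray). A minimal sketch of that round trip, independent of the snippets (the sample payload is illustrative):

import json

# A JSON array parses into a plain Python list of dicts.
payload = '[{"name": "alpha", "year": 2002}, {"name": "beta", "year": 2003}]'
records = json.loads(payload)
print(len(records), records[0]["name"])

# Building the equivalent of a JSONArray and serializing it back.
jsonarray = [{"id": str(i), "name": name} for i, name in enumerate(["alpha", "beta"], 1)]
print(json.dumps(jsonarray, ensure_ascii=False))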

FuJianSearcher-1.py

Source: FuJianSearcher-1.py (GitHub)

# coding=gbk
import PackageTool
import requests
import os
from PIL import Image
from bs4 import BeautifulSoup
import json
import re
from FuJianConfig import *
import datetime
from requests.exceptions import RequestException
import sys
import time
from gs import MSSQL
import random
import subprocess
# from Tables_dict import *
from gs.Searcher import Searcher
from gs.Searcher import get_args
from gs.KafkaAPI import KafkaAPI
import requests

requests.packages.urllib3.disable_warnings()


class FuJianSearcher(Searcher):
    search_result_json = None
    pattern = re.compile("\s")
    cur_mc = ''
    cur_code = ''
    json_result_data = []
    today = None
    # kafka = KafkaAPI("GSCrawlerTest")
    session_token = None
    cur_time = None
    verify_ip = None
    # save_tag_a = None
    load_func_dict = {}

    def __init__(self):
        super(FuJianSearcher, self).__init__(use_proxy=True)
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0",
                        "Host": "wsgs.fjaic.gov.cn",
                        "Accept": "*/*",  # difference from the website
                        "Accept-Encoding": "gzip, deflate",
                        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                        "Connection": "keep-alive",
                        "Referer": "http://wsgs.fjaic.gov.cn/creditpub/home",
                        "Upgrade-Insecure-Requests": "1",  # why add this?
                        "Content-type": "application/json"
                        }
        # self.cur_time = '%d' % (time.time() * 1000)
        self.get_session_token()
        self.get_verify_ip()
        # self.json_result = {}
        self.set_config()
        time = datetime.datetime.now()
        self.time = time.strftime('%Y%m%d')
        self.load_func_dict[u'¶¯²úµÖѺµÇ¼ÇÐÅÏ¢'] = self.load_dongchandiyadengji
        self.load_func_dict[u'¶¯²úµÖѺÐÅÏ¢'] = self.load_dongchandiyadengji
        self.load_func_dict[u'¹ÉȨ³öÖʵǼÇÐÅÏ¢'] = self.load_guquanchuzhidengji
        self.load_func_dict[u'ÐÐÕþ´¦·£ÐÅÏ¢'] = self.load_xingzhengchufa
        self.load_func_dict[u'¾­ÓªÒì³£ÐÅÏ¢'] = self.load_jingyingyichang
        self.load_func_dict[u'ÑÏÖØÎ¥·¨ÐÅÏ¢'] = self.load_yanzhongweifa
        self.load_func_dict[u'ÑÏÖØÎ¥·¨Ê§ÐÅÐÅÏ¢'] = self.load_yanzhongweifa
        self.load_func_dict[u'³é²é¼ì²éÐÅÏ¢'] = self.load_chouchajiancha
        self.load_func_dict[u'»ù±¾ÐÅÏ¢'] = self.load_jiben
        self.load_func_dict[u'¹É¶«ÐÅÏ¢'] = self.load_gudong
        self.load_func_dict[u'·¢ÆðÈËÐÅÏ¢'] = self.load_gudong
        self.load_func_dict[u'±ä¸üÐÅÏ¢'] = self.load_biangeng
        self.load_func_dict[u'Ö÷ÒªÈËÔ±ÐÅÏ¢'] = self.load_zhuyaorenyuan
        self.load_func_dict[u'·ÖÖ§»ú¹¹ÐÅÏ¢'] = self.load_fenzhijigou
        self.load_func_dict[u'ÇåËãÐÅÏ¢'] = self.load_qingsuan
        self.load_func_dict[u'²Î¼Ó¾­ÓªµÄ¼ÒÍ¥³ÉÔ±ÐÕÃû'] = self.load_jiatingchengyuan  # Modified by Jing
        self.load_func_dict[u'Ͷ×ÊÈËÐÅÏ¢'] = self.load_touziren  # Modified by Jing
        self.load_func_dict[u'ºÏ»ïÈËÐÅÏ¢'] = self.load_hehuoren  # Modified by Jing
        self.load_func_dict[u'³ÉÔ±Ãû²á'] = self.load_chengyuanmingce  # Modified by Jing
        self.load_func_dict[u'³·ÏúÐÅÏ¢'] = self.load_chexiao  # Modified by Jing
        self.load_func_dict[u'Ö÷¹Ü²¿ÃÅ£¨³ö×ÊÈË£©ÐÅÏ¢'] = self.load_DICInfo  # Modified by Jing

    def set_config(self):
        self.plugin_path = os.path.join(sys.path[0], '../fu_jian/ocr/type34.bat')
        self.group = 'Crawler'  # production
        self.kafka = KafkaAPI("GSCrawlerResult")  # production
        # self.group = 'CrawlerTest'  # test
        # self.kafka = KafkaAPI("GSCrawlerTest")  # test
        self.topic = 'GsSrc35'
        self.province = u'¸£½¨Ê¡'
        self.kafka.init_producer()

    def get_verify_ip(self):
        url = 'http://wsgs.fjaic.gov.cn/creditpub/security/verify_ip'
        r = self.post_request(url, timeout=20)  # ,verify=False
        self.verify_ip = r.text

    def get_verify_keyword(self, keyword):
        url = "http://wsgs.fjaic.gov.cn/creditpub/security/verify_keyword"
        params = {'keyword': keyword}
        r = self.post_request(url, params, timeout=20)  # ,verify=False
        return r.text

    def get_validate_image_save_path(self):
        return os.path.join(sys.path[0], '../temp/' + str(random.random())[2:] + '.png')

    def get_validate_file_path(self):
        return os.path.join(sys.path[0], '../temp/' + str(random.random())[2:] + '.txt')

    def recognize_yzm(self, validate_path, validate_result_path):
        cmd = self.plugin_path + " " + validate_path + " " + validate_result_path
        # print cmd
        p = subprocess.Popen(cmd.encode('gbk', 'ignore'), stdout=subprocess.PIPE)
        p.communicate()
        fo = open(validate_result_path, 'r')
        answer = fo.readline().strip()
        fo.close()
        print 'answer: ' + answer.decode('gbk', 'ignore')
        os.remove(validate_path)
        os.remove(validate_result_path)
        return answer.decode('gbk', 'ignore')

    def get_yzm(self):
        params = {'ra': '%.15f' % random.random(), 'preset:': ''}  # how does this relate to the website?
        image_url = 'http://wsgs.fjaic.gov.cn/creditpub/captcha'
        r = self.get_request(image_url, params, timeout=20)  # , verify=False
        # print r.headers
        yzm_path = self.get_validate_image_save_path()
        with open(yzm_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
            f.close()
        yzm_file_path = self.get_validate_file_path()
        yzm = self.recognize_yzm(yzm_path, yzm_file_path)
        return yzm

    def get_session_token(self):
        r = self.get_request('http://wsgs.fjaic.gov.cn/creditpub/home', timeout=20)  # ,verify=False
        # print r.text
        idx_1 = r.text.index('session.token": "') + len('session.token": "')
        idx_2 = r.text.index('"', idx_1)
        self.session_token = r.text[idx_1:idx_2]

    def get_tag_a_from_page(self, keyword):
        tag_a = None
        for t in range(10):
            self.get_verify_keyword(keyword)
            # self.today = str(datetime.date.today()).replace('-', '')
            yzm = self.get_yzm()
            url_1 = 'http://wsgs.fjaic.gov.cn/creditpub/security/verify_captcha'
            params_1 = {'captcha': yzm, 'session.token': self.session_token}
            r_1 = self.post_request(url=url_1, params=params_1, timeout=20)  # , verify=False
            # print r_1, r_1.text
            if r_1.text != '0':
                url_2 = 'http://wsgs.fjaic.gov.cn/creditpub/search/ent_info_list'
                params_2 = {'captcha': yzm, 'condition.keyword': keyword, 'searchType': '1', 'session.token': self.session_token}
                r_2 = self.post_request(url=url_2, params=params_2, timeout=20)  # , verify=False
                r_2.encoding = 'utf-8'
                if u'ÄúËÑË÷µÄÌõ¼þÎÞ²éѯ½á¹û' not in r_2.text:
                    soup = BeautifulSoup(r_2.text, 'lxml')
                    content = soup.find(class_='list-info')
                    corp = content.find(class_='list-item')
                    self.cur_mc = corp.find(class_='link').get_text().strip().replace('(', u'£¨').replace(')', u'£©')
                    if keyword == self.cur_mc:
                        self.cur_code = corp.find(class_='profile').span.get_text().strip()
                        tag_a = corp.find(class_='link').a['href']
                        break
        return tag_a

    def get_search_args(self, tag_a, keyword):
        if tag_a:
            return [tag_a]
        else:
            return []

    def parse_detail(self, args):
        page = args[0]  # arg Ê×Ò³µÄimport ´Ógs.searcherÖÐÒýÈëµÄ
        r2 = self.post_request(page, timeout=20)  # page is tag_a
        if u'¸ÃÊг¡Ö÷Ìå²»ÔÚ¹«Ê¾·¶Î§' not in r2.text:
            resdetail = BeautifulSoup(r2.text, 'lxml')
            print 'self.save_tag_a'
            print self.save_tag_a
            if not self.save_tag_a:
                li_list = resdetail.select('html body.layout div.main div.notice ul li')  # detail page
                mc = li_list[0].text
                code = li_list[1].text
                # title_bar = resdetail.find(class_='title-bar clearfix')
                # if not title_bar:
                #     print '************************************'
                #     print r2.text
                # mc=title_bar.find('li')
                # code=mc.find_next('li')
                self.cur_mc = mc.strip()
                self.cur_code = code.strip()[13:]  # the first 13 chars are the "registration no./unified social credit code:" label
                print self.cur_mc, self.cur_code
            div_element_list = resdetail.find_all(class_='hide')  # (style="display: none;")  # can the hidden ones be read too? print to confirm
            for div_element in div_element_list:
                table_element_list = div_element.find_all('table')
                for table_element in table_element_list:
                    row_cnt = len(table_element.find_all("tr"))
                    # print 'row_cnt',row_cnt
                    table_desc = table_element.find("th").get_text().strip().split('\n')[0]
                    if table_desc in self.load_func_dict:
                        if table_desc == u'ÇåËãÐÅÏ¢':
                            self.load_func_dict[table_desc](table_element)
                        elif row_cnt > 3:
                            self.load_func_dict[table_desc](table_element)
                    else:
                        raise Exception("unknown table!")
        else:
            print u'¸ÃÊг¡Ö÷Ìå²»ÔÚ¹«Ê¾·¶Î§'

    def load_jiben(self, table_element):
        jsonarray = []
        tr_element_list = table_element.find_all("tr")
        values = {}
        for tr_element in tr_element_list[1:]:
            th_element_list = tr_element.find_all('th')
            td_element_list = tr_element.find_all('td')
            if len(th_element_list) == len(td_element_list):
                col_nums = len(th_element_list)
                for i in range(col_nums):
                    col_dec = th_element_list[i].get_text().strip().replace('\n', '')
                    col = jiben_column_dict[col_dec]
                    val = td_element_list[i].get_text().strip().replace('\n', '')
                    if col != u'':
                        values[col] = val
                        if col == 'Registered_Info:registrationno':
                            if len(val) == 18:
                                values['Registered_Info:tyshxy_code'] = val
                            else:
                                values['Registered_Info:zch'] = val
                    # print col,val
        values['Registered_Info:province'] = self.province
        values['rowkey'] = self.cur_mc + '_01_' + self.cur_code + '_'
        jsonarray.append(values)
        self.json_result['Registered_Info'] = jsonarray  # what does this mean?
        # json_jiben=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_jiben',json_jiben

    def load_gudong(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].get_text().strip().replace('\n', '')
                col = gudong_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                if val == u'ÏêÇé':
                    link = td.a['href']
                    detail_th_list = ['Shareholder_Info:subscripted_capital', 'Shareholder_Info:actualpaid_capital',
                                      'Shareholder_Info:subscripted_method', 'Shareholder_Info:subscripted_amount',
                                      'Shareholder_Info:subscripted_time', 'Shareholder_Info:actualpaid_method',
                                      'Shareholder_Info:actualpaid_amount', 'Shareholder_Info:actualpaid_time']
                    r2 = self.get_request(link)
                    resdetail = r2.text
                    htmldetail = BeautifulSoup(resdetail, 'html.parser')  # why can it load JS? what's the reason?
                    detail_content = htmldetail.find(class_="info m-bottom m-top")
                    detail_tr_list = detail_content.find_all('tr')
                    if len(detail_tr_list) > 3:
                        for tr_ele in detail_tr_list[3:]:
                            td_ele_list = tr_ele.find_all('td')[1:]
                            detail_col_nums = len(td_ele_list)
                            for m in range(detail_col_nums):
                                col = detail_th_list[m]
                                td = td_ele_list[m]
                                val = td.text.strip()
                                values[col] = val
                                # print col,val
                    values[col] = link
                else:
                    values[col] = val
            values['Shareholder_Info:registrationno'] = self.cur_code
            values['Shareholder_Info:enterprisename'] = self.cur_mc
            values['Shareholder_Info:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_04_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1  # how is it mapped? via values['Shareholder_Info:id'] = str(id) above
        self.json_result['Shareholder_Info'] = jsonarray
        # json_gudong=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_gudong',json_gudong

    def load_touziren(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].get_text().strip().replace('\n', '')
                col = touziren_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['Investor_Info:registrationno'] = self.cur_code
            values['Investor_Info:enterprisename'] = self.cur_mc
            values['Investor_Info:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_02_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Investor_Info'] = jsonarray
        # json_touziren=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_touziren',json_touziren

    def load_hehuoren(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].get_text().strip().replace('\n', '')
                col = hehuoren_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['Partner_Info:registrationno'] = self.cur_code
            values['Partner_Info:enterprisename'] = self.cur_mc
            values['Partner_Info:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_03_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Partner_Info'] = jsonarray
        # json_hehuoren=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_hehuoren',json_hehuoren

    def load_DICInfo(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].get_text().strip().replace('\n', '')
                col = DICInfo_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['DIC_Info:registrationno'] = self.cur_code
            values['DIC_Info:enterprisename'] = self.cur_mc
            values['DIC_Info:id'] = str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        # self.json_result['DIC_Info']=jsonarray
        # json_DICInfo=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_DICInfo',json_DICInfo

    def load_biangeng(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = biangeng_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                if val.endswith(u'ÊÕÆð¸ü¶à'):
                    valmore = td.find(id='allWords').get_text().strip().replace('\n', '')
                    values[col] = valmore
                else:
                    values[col] = val
                # print col,val
            values['Changed_Announcement:registrationno'] = self.cur_code
            values['Changed_Announcement:enterprisename'] = self.cur_mc
            values['Changed_Announcement:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_05_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Changed_Announcement'] = jsonarray
        # json_biangeng=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_biangeng',json_biangeng

    def load_chexiao(self, table_element):
        pass

    def load_zhuyaorenyuan(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            for i in range(6):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = zhuyaorenyuan_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print th,val
                if len(values) == 3:
                    if values['KeyPerson_Info:keyperson_name'] == '':
                        continue
                    else:
                        values['KeyPerson_Info:registrationno'] = self.cur_code
                        values['KeyPerson_Info:enterprisename'] = self.cur_mc
                        values['KeyPerson_Info:id'] = str(id)
                        values['rowkey'] = self.cur_mc + '_06_' + self.cur_code + '_' + self.time + str(id)
                        jsonarray.append(values)
                        values = {}
                        id += 1
        self.json_result['KeyPerson_Info'] = jsonarray
        # json_zhuyaorenyuan=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_zhuyaorenyuan',json_zhuyaorenyuan

    def load_jiatingchengyuan(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            for i in range(4):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = jiatingchengyuan_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print th,val
                if len(values) == 2:
                    values['Family_Info:registrationno'] = self.cur_code
                    values['Family_Info:enterprisename'] = self.cur_mc
                    values['Family_Info:id'] = str(id)
                    values['rowkey'] = self.cur_mc + '_07_' + self.cur_code + '_' + self.time + str(id)
                    jsonarray.append(values)
                    values = {}
                    id += 1
        self.json_result['Family_Info'] = jsonarray
        # json_jiatingchengyuan=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_jiatingchengyuan',json_jiatingchengyuan

    def load_chengyuanmingce(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            for i in range(4):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = chengyuanmingce_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print th,val
                if len(values) == 2:
                    values['Members_Info:registrationno'] = self.cur_code
                    values['Members_Info:enterprisename'] = self.cur_mc
                    values['Members_Info:id'] = str(id)
                    jsonarray.append(values)
                    values = {}
        # self.json_result['Members_Info']=jsonarray
        # json_jiatingchengyuan=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_jiatingchengyuan',json_jiatingchengyuan

    def load_fenzhijigou(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = fenzhijigou_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['Branches:registrationno'] = self.cur_code
            values['Branches:enterprisename'] = self.cur_mc
            values['Branches:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_08_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Branches'] = jsonarray
        # json_fenzhijigou=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_fenzhijigou',json_fenzhijigou

    # load liquidation info
    def load_qingsuan(self, table_element):
        tr_element_list = table_element.find_all('tr')[1:]
        # th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        for tr_element in tr_element_list:
            col_desc = tr_element.find('th').get_text().strip()
            col = qing_suan_dict[col_desc]
            td_list = tr_element.find_all('td')
            td_va = []
            for td in td_list:
                va = td.get_text().strip()
                td_va.append(va)
            val = ','.join(td_va)
            values[col] = val
        values['liquidation_Information:registrationno'] = self.cur_code
        values['liquidation_Information:enterprisename'] = self.cur_mc
        values['rowkey'] = self.cur_mc + '_09_' + self.cur_code + '_'
        jsonarray.append(values)
        values = {}
        self.json_result['liquidation_Information'] = jsonarray
        # json_fenzhijigou=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_fenzhijigou',json_fenzhijigou

    def load_dongchandiyadengji(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = dongchandiyadengji_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                if val == u'ÏêÇé':
                    link = td.a['href']
                    # print 'detail_link',self.detail_link
                    values[col] = link
                else:
                    values[col] = val
            values['Chattel_Mortgage:registrationno'] = self.cur_code
            values['Chattel_Mortgage:enterprisename'] = self.cur_mc
            values['Chattel_Mortgage:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_11_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Chattel_Mortgage'] = jsonarray
        # json_dongchandiyadengji=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_dongchandiyadengji',json_dongchandiyadengji

    def load_guquanchuzhidengji(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                previous = th_element_list[(i - 1)].text.strip().replace('\n', '')
                if col_dec == u'Ö¤ÕÕ/Ö¤¼þºÅÂë' and previous == u'³öÖÊÈË':
                    col = 'Equity_Pledge:equitypledge_pledgorid'
                elif col_dec == u'Ö¤ÕÕ/Ö¤¼þºÅÂë' and previous == u'ÖÊȨÈË':
                    col = 'Equity_Pledge:equitypledge_pawneeid'
                else:
                    col = guquanchuzhidengji_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                if val == u'ÏêÇé':
                    link = td.a['href']
                    values[col] = link
                else:
                    values[col] = val
            values['Equity_Pledge:registrationno'] = self.cur_code
            values['Equity_Pledge:enterprisename'] = self.cur_mc
            values['Equity_Pledge:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_12_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Equity_Pledge'] = jsonarray
        # json_guquanchuzhidengji=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_guquanchuzhidengji',json_guquanchuzhidengji

    def load_xingzhengchufa(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = xingzhengchufa_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                if val == u'ÏêÇé':
                    link = td.a['href']
                    # print 'detail_link',self.detail_link
                    values[col] = link
                else:
                    values[col] = val
            values['Administrative_Penalty:registrationno'] = self.cur_code
            values['Administrative_Penalty:enterprisename'] = self.cur_mc
            values['Administrative_Penalty:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_13_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Administrative_Penalty'] = jsonarray
        # json_xingzhengchufa=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_xingzhengchufa',json_xingzhengchufa

    def load_jingyingyichang(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = jingyingyichang_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['Business_Abnormal:registrationno'] = self.cur_code
            values['Business_Abnormal:enterprisename'] = self.cur_mc
            values['Business_Abnormal:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_14_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Business_Abnormal'] = jsonarray
        # json_jingyingyichang=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_jingyingyichang',json_jingyingyichang

    def load_yanzhongweifa(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = yanzhongweifa_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['Serious_Violations:registrationno'] = self.cur_code
            values['Serious_Violations:enterprisename'] = self.cur_mc
            values['Serious_Violations:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_15_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Serious_Violations'] = jsonarray
        # json_yanzhongweifa=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_yanzhongweifa',json_yanzhongweifa

    def load_chouchajiancha(self, table_element):
        tr_element_list = table_element.find_all(class_="page-item")
        th_element_list = table_element.find_all('th')[1:-1]
        jsonarray = []
        values = {}
        id = 1
        for tr_element in tr_element_list:
            td_element_list = tr_element.find_all('td')
            col_nums = len(th_element_list)
            for i in range(col_nums):
                col_dec = th_element_list[i].text.strip().replace('\n', '')
                col = chouchajiancha_column_dict[col_dec]
                td = td_element_list[i]
                val = td.get_text().strip()
                values[col] = val
                # print col,val
            values['Spot_Check:registrationno'] = self.cur_code
            values['Spot_Check:enterprisename'] = self.cur_mc
            values['Spot_Check:id'] = str(id)
            values['rowkey'] = self.cur_mc + '_16_' + self.cur_code + '_' + self.time + str(id)
            jsonarray.append(values)
            values = {}
            id += 1
        self.json_result['Spot_Check'] = jsonarray
        # json_chouchajiancha=json.dumps(jsonarray,ensure_ascii=False)
        # print 'json_chouchajiancha',json_chouchajiancha


if __name__ == '__main__':
    args_dict = get_args()
    searcher = FuJianSearcher()
    # searcher.submit_search_request(u'Äþ»¯ÏØÍúöÎÔ´Ô°ÒÕ¹¤³ÌÓÐÏÞ¹«Ë¾')
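
Each load_* method above walks one HTML table, builds a dict per row, appends it to a local jsonarray list, and finally stores that list under a table-specific key in self.json_result (apparently initialised in the gs.Searcher base class, whose source is not shown here). The commented-out json.dumps(jsonarray, ensure_ascii=False) calls show how such a list becomes a JSON array string. A minimal sketch of that pattern on its own, with a hypothetical helper and sample values (rows_to_jsonarray is not part of the scraper):

import json

def rows_to_jsonarray(rows, prefix, registration_no, enterprise_name):
    # rows: list of {column: value} dicts scraped from one HTML table
    jsonarray = []
    for i, row in enumerate(rows, 1):
        values = dict(row)
        values[prefix + ':registrationno'] = registration_no
        values[prefix + ':enterprisename'] = enterprise_name
        values[prefix + ':id'] = str(i)
        jsonarray.append(values)
    # Same serialization the commented-out lines in the scraper perform.
    return json.dumps(jsonarray, ensure_ascii=False)

print(rows_to_jsonarray([{'Shareholder_Info:shareholder_name': 'Demo Holder'}],
                        'Shareholder_Info', '913500000000000000', 'Demo Co.'))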


total.py

Source: total.py (GitHub)

#####################################################################
#  Version that buckets the extracted keyphrases by publication year
#####################################################################

import json

with open('total.json', 'rt', encoding='UTF8') as f:
    json_data = json.load(f)

jsonArray = json_data.get("value")  # 5,433 documents in total
print(len(jsonArray))
# print(jsonArray[0])
# print(type(jsonArray[0]['publication_year']))
# print(len(jsonArray[0].get("keyphrases")))  # to check how many keyphrases a document has

# One bucket per publication year, 2002..2021.
years = {year: [] for year in range(2002, 2022)}

for i in range(100):  # number of documents to scan (100 means: take 100 of the 5,433 documents)
    doc = jsonArray[i]
    year = doc['publication_year']
    if year in years:
        # Append every keyphrase of this document to its year bucket.
        # Note: a word that appears several times inside one document
        # (e.g. 'SOA' in jsonArray[0]) is NOT deduplicated here.
        for keyphrase in doc.get("keyphrases"):
            years[year].append(keyphrase)
    else:
        print("Different year:", year)

for year in sorted(years):
    print("=================================== %d =======================================\n" % year)
    print(years[year])

# key = jsonArray[0].get("keyphrases")
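
If total.json might contain publication years outside the 2002-2021 range, a collections.defaultdict avoids pre-building the per-year buckets; a minimal sketch assuming the same file layout (a top-level "value" array of documents with publication_year and keyphrases fields):

import json
from collections import defaultdict

with open('total.json', 'rt', encoding='UTF8') as f:
    docs = json.load(f).get("value")

by_year = defaultdict(list)  # publication_year -> list of keyphrases
for doc in docs[:100]:       # same 100-document sample as above
    by_year[doc['publication_year']].extend(doc.get("keyphrases") or [])

for year in sorted(by_year):
    print(year, len(by_year[year]))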

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, right from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

