創新互聯www.cdcxhl.cn八線動態BGP香港云服務器提供商,新人活動買多久送多久,劃算不套路!
創新互聯建站是一家企業級云計算解決方案提供商,超15年IDC數據中心運營經驗。主營GPU顯卡服務器,站群服務器,遂寧服務器托管,海外高防服務器,服務器機柜,動態撥號VPS,海外云手機,海外云服務器,海外服務器租用托管等。小編給大家分享一下python3解析html的方法,希望大家閱讀完這篇文章后有所收獲,下面讓我們一起去探討吧!
解析html是爬蟲抓取之后一個重要的數據處理環節。以下記錄解析html的幾種方式。
先介紹基礎的輔助函數,主要用于獲取html并輸出解析后的結果。
# The parsing function is passed in as an argument so that the parsers shown
# below can be swapped in without touching the download code.
def get_html(url, paraser=None):
    """Download *url* and return the result of running *paraser* on its body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the raw page body and returns the
            parsed value. ``None`` (the default) selects ``bs4_paraser``.

    Returns:
        Whatever ``paraser`` returns, or ``None`` when the response status
        code is not 200.
    """
    if paraser is None:
        # Looked up at call time, because bs4_paraser is defined further down
        # in the file than this function.
        paraser = bs4_paraser
    headers = {
        'Accept': '*/*',
        # Only gzip is advertised: it is the only content encoding that is
        # decoded below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # No explicit 'Host' entry: urllib2 fills it in from the URL, so the
        # function also works for pages on other hosts.
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        data = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    if 'gzip' in content_encoding.lower():
        # Decompress only when the server really sent a gzip body.
        data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    return paraser(data)


# Demo: fetch one page and print every parsed review.
# NOTE: lxml_parser is defined further down, so this call only works once that
# definition has been executed.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response; iterate over nothing then.
for row in value or []:
    print(row)
1、lxml.html的方式進行解析。
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官網](http://lxml.de/)
def lxml_parser(page):
    """Extract the review entries from *page* with lxml XPath queries.

    Args:
        page: HTML source of the page.

    Returns:
        A list with one dict per review item that has a title block. Each
        dict has the keys ``title``, ``title_href``, ``time`` and ``people``,
        plus ``score`` when the rating style contains a number.
    """
    data = []
    doc = etree.HTML(page)
    for wrap in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every review is a <div class="item"> inside the list wrapper.
        for item in wrap.xpath('.//div[@class="item"]'):
            blocks = item.xpath('.//div[@class="g-clear title-wrap"][1]')
            if not blocks:
                # Same guard as bs4_paraser: skip items without a title block
                # instead of failing on blocks[0].
                continue
            title = blocks[0]
            value = {}
            value['title'] = title.xpath('./a/text()')[0]
            value['title_href'] = title.xpath('./a/@href')[0]
            # The score is read from the first number in the style attribute
            # of the nested rating span.
            style = title.xpath('./div/span/span/@style')[0]
            match = re.search(r'\d+', style)
            if match:
                # Floor division keeps the score an int on Python 2 and 3.
                value['score'] = int(match.group()) // 20
            # Publication time.
            value['time'] = title.xpath('./div/span[@class="time"]/text()')[0]
            # Number of people who liked the review.
            value['people'] = int(
                re.search(r'\d+', title.xpath('./div[@class="num"]/span/text()')[0]).group())
            data.append(value)
    return data
2、使用BeautifulSoup,不多說了,大家網上找資料看看。
def bs4_paraser(html):
    """Extract the review entries from *html* with BeautifulSoup.

    Args:
        html: HTML source of the page.

    Returns:
        A list with one dict per review item that has a title block. Each
        dict has the keys ``title``, ``title_href``, ``time`` and ``people``,
        plus ``score`` when the rating style contains a number.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # Only the first review list wrapper is inspected.
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Every review is a <div class="item"> inside the wrapper.
        for item in row.find_all('div', attrs={'class': 'item'}):
            blocks = item.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if not blocks:
                # No title block: nothing to extract, so emit no record.
                continue
            title = blocks[0]
            # A fresh dict per item, so records never share state.
            value = {}
            value['title'] = title.a.string
            value['title_href'] = title.a['href']
            # The score is read from the first number in the style attribute
            # of the nested rating span.
            match = re.search(r'\d+', title.div.span.span['style'])
            if match:
                # Floor division keeps the score an int on Python 2 and 3.
                value['score'] = int(match.group()) // 20
            # Publication time.
            value['time'] = title.div.find_all('span', attrs={'class': 'time'})[0].string
            # Number of people who liked the review.
            value['people'] = int(
                re.search(r'\d+', title.find_all('div', attrs={'class': 'num'})[0].span.string).group())
            all_value.append(value)
    return all_value
3、使用SGMLParser,主要是通過start、end tag的方式進行,解析過程比較明朗,但是有點麻煩,而該案例的場景不太適合該方法。
class CommentParaser(SGMLParser):
    """Event-driven review extractor built on SGMLParser.

    The parser remembers which of the interesting ``<div>`` elements are
    currently open in a set of boolean flags and collects one dict per review
    item in ``self.data``.

    NOTE: the flags are not a stack. ``end_div`` decides which element a
    closing tag belongs to purely from the flags that are set, so a ``<div>``
    other than the ones recognised in ``start_div`` is not tracked.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Which of the recognised <div> elements are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title link.
        self.__start_a = False
        # Span state: 0 none, 1 rating span, 2 time span,
        # 3 span nested in the rating span, 4 span in the "num" div.
        self.__span_state = 0
        # Record being built for the current item.
        self.__value = {}
        # Finished records.
        self.data = []

    def start_div(self, attrs):
        """Set the flag that matches the class of an opening ``<div>``."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Clear the innermost set flag; closing an item stores its record."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself is closing: store the record and start a new one.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def __in_title(self):
        """Return True while inside the title block of a review item."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def start_a(self, attrs):
        """Record the link target of an ``<a>`` inside the title block."""
        if self.__in_title():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    # 'href' is the key this parser has always used;
                    # 'title_href' is the key the lxml/bs4 parsers use.
                    self.__value['href'] = v
                    self.__value['title_href'] = v

    def end_a(self):
        """Leave the title link."""
        if self.__in_title() and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state and read the score from the nested span."""
        if not self.__in_title():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested in the rating span: the score is the first
                # number in its style attribute.
                for k, v in attrs:
                    if k == 'style' and v:
                        match = re.search(r'\d+', v)
                        if match:
                            # Floor division keeps the score an int on
                            # Python 2 and 3.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Reset the span state on every closing ``</span>``."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            match = re.search(r'\d+', data)
            if match:
                # Text without a number (e.g. whitespace) is ignored.
                self.__value['people'] = int(match.group())


def sgl_parser(html):
    """Parse *html* with CommentParaser and return the collected records."""
    parser = CommentParaser()
    parser.feed(html)
    # Force processing of any input the parser is still buffering.
    parser.close()
    return parser.data
4、HTMLParser,與3原理相似,就是調用的方法不太一樣,基本上可以共用。
class CommentHTMLParser(HTMLParser.HTMLParser):
    """Event-driven review extractor built on HTMLParser.

    The parser remembers which of the interesting ``<div>`` elements are
    currently open in a set of boolean flags and collects one dict per review
    item in ``self.data``.

    NOTE: the flags are not a stack. ``handle_endtag`` decides which element a
    closing ``</div>`` belongs to purely from the flags that are set, so a
    ``<div>`` other than the ones recognised in ``handle_starttag`` is not
    tracked.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Which of the recognised <div> elements are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the title link.
        self.__start_a = False
        # Span state: 0 none, 1 rating span, 2 time span,
        # 3 span nested in the rating span, 4 span in the "num" div.
        self.__span_state = 0
        # Record being built for the current item.
        self.__value = {}
        # Finished records.
        self.data = []

    def __in_title(self):
        """Return True while inside the title block of a review item."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def handle_starttag(self, tag, attrs):
        """Track opening ``div``/``a``/``span`` tags; read link and score."""
        if tag == 'div':
            for k, v in attrs:
                if k != 'class':
                    continue
                if v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif v == 'item':
                    self.__start_div_item = True
                elif v == 'g-clear title-wrap':
                    self.__start_div_gclear = True
                elif v == 'rating-wrap g-clear':
                    self.__start_div_ratingwrap = True
                elif v == 'num':
                    self.__start_div_num = True
        elif tag == 'a':
            if self.__in_title():
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        # 'href' is the key this parser has always used;
                        # 'title_href' is the key the lxml/bs4 parsers use.
                        self.__value['href'] = v
                        self.__value['title_href'] = v
        elif tag == 'span':
            if not self.__in_title():
                return
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    # Span nested in the rating span: the score is the first
                    # number in its style attribute.
                    for k, v in attrs:
                        if k == 'style' and v:
                            match = re.search(r'\d+', v)
                            if match:
                                # Floor division keeps the score an int on
                                # Python 2 and 3.
                                self.__value['score'] = int(match.group()) // 20
                    self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

    def handle_endtag(self, tag):
        """Clear the flag of the closing element; an item end stores a record."""
        if tag == 'div':
            if not self.__start_div_yingping:
                return
            if not self.__start_div_item:
                self.__start_div_yingping = False
            elif not self.__start_div_gclear:
                # The item itself is closing: store the record, start anew.
                self.data.append(self.__value)
                self.__value = {}
                self.__start_div_item = False
            elif self.__start_div_num or self.__start_div_ratingwrap:
                self.__start_div_num = False
                self.__start_div_ratingwrap = False
            else:
                self.__start_div_gclear = False
        elif tag == 'a':
            if self.__in_title() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            match = re.search(r'\d+', data)
            if match:
                # Text without a number (e.g. whitespace) is ignored.
                self.__value['people'] = int(match.group())


def html_parser(html):
    """Parse *html* with CommentHTMLParser and return the collected records."""
    parser = CommentHTMLParser()
    parser.feed(html)
    # Force processing of any input the parser is still buffering.
    parser.close()
    return parser.data
看完了這篇文章,相信你對python3解析html的方法有了一定的了解,想了解更多相關知識,歡迎關注創新互聯-成都網站建設公司行業資訊頻道,感謝各位的閱讀!