亚洲激情专区-91九色丨porny丨老师-久久久久久久女国产乱让韩-国产精品午夜小视频观看

溫馨提示×

溫馨提示×

您好,登錄后才能下訂單哦!

密碼登錄×
登錄注冊×
其他方式登錄
點擊 登錄注冊 即表示同意《億速云用戶服務條款》

Python3 中怎么解析html

發布時間:2021-06-17 15:28:27 來源:億速云 閱讀:168 作者:Leah 欄目:開發技術

Python3 中怎么解析html,針對這個問題,這篇文章詳細介紹了相對應的分析和解答,希望可以幫助更多想解決這個問題的小伙伴找到更簡單易行的方法。

輔助函數,主要用于獲取html并輸入解析后的結束

#把傳遞解析函數,便于下面的修改
def get_html(url, paraser=bs4_paraser):
 headers = {
  'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate, sdch',
  'Accept-Language': 'zh-CN,zh;q=0.8',
  'Host': 'www.360kan.com',
  'Proxy-Connection': 'keep-alive',
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
 }
 request = urllib2.Request(url, headers=headers)
 response = urllib2.urlopen(request)
 response.encoding = 'utf-8'
 if response.code == 200:
  data = StringIO.StringIO(response.read())
  gzipper = gzip.GzipFile(fileobj=data)
  data = gzipper.read()
  value = paraser(data) # open('E:/h6/haPkY0osd0r5UB.html').read()
  return value
 else:
  pass
 
 
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
for row in value:
 print row

1,lxml.html的方式進行解析,

The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官網](http://lxml.de/)

def lxml_parser(page):
 data = []
 doc = etree.HTML(page)
 all_div = doc.xpath('//div[@class="yingping-list-wrap"]')
 for row in all_div:
  # 獲取每一個影評,即影評的item
  all_div_item = row.xpath('.//div[@class="item"]') # find_all('div', attrs={'class': 'item'})
  for r in all_div_item:
   value = {}
   # 獲取影評的標題部分
   title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
   value['title'] = title[0].xpath('./a/text()')[0]
   value['title_href'] = title[0].xpath('./a/@href')[0]
   score_text = title[0].xpath('./div/span/span/@style')[0]
   score_text = re.search(r'\d+', score_text).group()
   value['score'] = int(score_text) / 20
   # 時間
   value['time'] = title[0].xpath('./div/span[@class="time"]/text()')[0]
   # 多少人喜歡
   value['people'] = int(
     re.search(r'\d+', title[0].xpath('./div[@class="num"]/span/text()')[0]).group())
   data.append(value)
 return data

2,使用BeautifulSoup,不多說了,大家網上找資料看看

def bs4_paraser(html):
 all_value = []
 value = {}
 soup = BeautifulSoup(html, 'html.parser')
 # 獲取影評的部分
 all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
 for row in all_div:
  # 獲取每一個影評,即影評的item
  all_div_item = row.find_all('div', attrs={'class': 'item'})
  for r in all_div_item:
   # 獲取影評的標題部分
   title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
   if title is not None and len(title) > 0:
    value['title'] = title[0].a.string
    value['title_href'] = title[0].a['href']
    score_text = title[0].div.span.span['style']
    score_text = re.search(r'\d+', score_text).group()
    value['score'] = int(score_text) / 20
    # 時間
    value['time'] = title[0].div.find_all('span', attrs={'class': 'time'})[0].string
    # 多少人喜歡
    value['people'] = int(
      re.search(r'\d+', title[0].find_all('div', attrs={'class': 'num'})[0].span.string).group())
   # print r
   all_value.append(value)
   value = {}
 return all_value

3,使用SGMLParser,主要是通過start、end tag的方式進行了,解析工程比較明朗,但是有點麻煩,而且該案例的場景不太適合該方法,(哈哈)

class CommentParaser(SGMLParser):
 def __init__(self):
  SGMLParser.__init__(self)
  self.__start_div_yingping = False
  self.__start_div_item = False
  self.__start_div_gclear = False
  self.__start_div_ratingwrap = False
  self.__start_div_num = False
  # a
  self.__start_a = False
  # span 3中狀態
  self.__span_state = 0
  # 數據
  self.__value = {}
  self.data = []
 
 def start_div(self, attrs):
  for k, v in attrs:
   if k == 'class' and v == 'yingping-list-wrap':
    self.__start_div_yingping = True
   elif k == 'class' and v == 'item':
    self.__start_div_item = True
   elif k == 'class' and v == 'g-clear title-wrap':
    self.__start_div_gclear = True
   elif k == 'class' and v == 'rating-wrap g-clear':
    self.__start_div_ratingwrap = True
   elif k == 'class' and v == 'num':
    self.__start_div_num = True
 
 def end_div(self):
  if self.__start_div_yingping:
   if self.__start_div_item:
    if self.__start_div_gclear:
     if self.__start_div_num or self.__start_div_ratingwrap:
      if self.__start_div_num:
       self.__start_div_num = False
      if self.__start_div_ratingwrap:
       self.__start_div_ratingwrap = False
     else:
      self.__start_div_gclear = False
    else:
     self.data.append(self.__value)
     self.__value = {}
     self.__start_div_item = False
   else:
    self.__start_div_yingping = False
 
 def start_a(self, attrs):
  if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
   self.__start_a = True
   for k, v in attrs:
    if k == 'href':
     self.__value['href'] = v
 
 def end_a(self):
  if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
   self.__start_a = False
 
 def start_span(self, attrs):
  if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
   if self.__start_div_ratingwrap:
    if self.__span_state != 1:
     for k, v in attrs:
      if k == 'class' and v == 'rating':
       self.__span_state = 1
      elif k == 'class' and v == 'time':
       self.__span_state = 2
    else:
     for k, v in attrs:
      if k == 'style':
       score_text = re.search(r'\d+', v).group()
     self.__value['score'] = int(score_text) / 20
     self.__span_state = 3
   elif self.__start_div_num:
    self.__span_state = 4
 
 def end_span(self):
  self.__span_state = 0
 
 def handle_data(self, data):
  if self.__start_a:
   self.__value['title'] = data
  elif self.__span_state == 2:
   self.__value['time'] = data
  elif self.__span_state == 4:
   score_text = re.search(r'\d+', data).group()
   self.__value['people'] = int(score_text)
  pass
def sgl_parser(html):
 parser = CommentParaser()
 parser.feed(html)
 return parser.data

4,HTMLParaer,與3原理相識,就是調用的方法不太一樣,基本上可以公用,

class CommentHTMLParser(HTMLParser.HTMLParser):
 def __init__(self):
  HTMLParser.HTMLParser.__init__(self)
  self.__start_div_yingping = False
  self.__start_div_item = False
  self.__start_div_gclear = False
  self.__start_div_ratingwrap = False
  self.__start_div_num = False
  # a
  self.__start_a = False
  # span 3中狀態
  self.__span_state = 0
  # 數據
  self.__value = {}
  self.data = []
 
 def handle_starttag(self, tag, attrs):
  if tag == 'div':
   for k, v in attrs:
    if k == 'class' and v == 'yingping-list-wrap':
     self.__start_div_yingping = True
    elif k == 'class' and v == 'item':
     self.__start_div_item = True
    elif k == 'class' and v == 'g-clear title-wrap':
     self.__start_div_gclear = True
    elif k == 'class' and v == 'rating-wrap g-clear':
     self.__start_div_ratingwrap = True
    elif k == 'class' and v == 'num':
     self.__start_div_num = True
  elif tag == 'a':
   if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
    self.__start_a = True
    for k, v in attrs:
     if k == 'href':
      self.__value['href'] = v
  elif tag == 'span':
   if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
    if self.__start_div_ratingwrap:
     if self.__span_state != 1:
      for k, v in attrs:
       if k == 'class' and v == 'rating':
        self.__span_state = 1
       elif k == 'class' and v == 'time':
        self.__span_state = 2
     else:
      for k, v in attrs:
       if k == 'style':
        score_text = re.search(r'\d+', v).group()
      self.__value['score'] = int(score_text) / 20
      self.__span_state = 3
    elif self.__start_div_num:
     self.__span_state = 4
 
 def handle_endtag(self, tag):
  if tag == 'div':
   if self.__start_div_yingping:
    if self.__start_div_item:
     if self.__start_div_gclear:
      if self.__start_div_num or self.__start_div_ratingwrap:
       if self.__start_div_num:
        self.__start_div_num = False
       if self.__start_div_ratingwrap:
        self.__start_div_ratingwrap = False
      else:
       self.__start_div_gclear = False
     else:
      self.data.append(self.__value)
      self.__value = {}
      self.__start_div_item = False
    else:
     self.__start_div_yingping = False
  elif tag == 'a':
   if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
    self.__start_a = False
  elif tag == 'span':
   self.__span_state = 0
 
 def handle_data(self, data):
  if self.__start_a:
   self.__value['title'] = data
  elif self.__span_state == 2:
   self.__value['time'] = data
  elif self.__span_state == 4:
   score_text = re.search(r'\d+', data).group()
   self.__value['people'] = int(score_text)
  pass
def html_parser(html):
 parser = CommentHTMLParser()
 parser.feed(html)
 return parser.data

關于Python3 中怎么解析html問題的解答就分享到這里了,希望以上內容可以對大家有一定的幫助,如果你還有很多疑惑沒有解開,可以關注億速云行業資訊頻道了解更多相關知識。

向AI問一下細節

免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。

AI

略阳县| 田东县| 明溪县| 滦南县| 尼木县| 荣成市| 新平| 崇信县| 上饶市| 噶尔县| 临高县| 济宁市| 南和县| 兴化市| 安多县| 乌恰县| 疏勒县| 满城县| 如皋市| 桂东县| 福州市| 尉氏县| 永安市| 阜南县| 莱芜市| 游戏| 本溪| 涟源市| 岱山县| 吉林市| 古浪县| 政和县| 台东市| 临颍县| 永昌县| 广安市| 新巴尔虎右旗| 子长县| 弋阳县| 阿坝县| 平陆县|