avatar
fireworks99
keep hungry keep foolish

About BeautifulSoup

Description

  1. BeautifulSoup的基本元素
  2. HTML内容遍历方法
  3. HTML格式化
  4. 信息标记与内容查找方法

1.BeautifulSoup五种基本元素

Five elements

import requests
from bs4 import BeautifulSoup


def getText(url):
    """Download ``url`` and return the decoded response body.

    Returns the sentinel string 'Error' when the request fails (connection
    problem, timeout, or an HTTP error status), so callers should check for
    it before parsing the result as HTML.
    """
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        # A timeout keeps the script from hanging forever on a dead server.
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        # Decode the body with the encoding detected from its content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are turned into the sentinel; programming
        # errors and KeyboardInterrupt propagate normally.
        return 'Error'


def solve(txt):
    """Print examples of the five basic BeautifulSoup elements.

    Shows Tag, tag name, attributes, NavigableString and Comment, using the
    first <a> tag of ``txt`` and a small hand-written snippet.
    """
    soup = BeautifulSoup(txt, 'html.parser')
    print(soup.title)

    a = soup.a  # tag access returns only the first matching tag
    if a is None:
        # Without this guard every attribute access below would raise
        # AttributeError on None.
        print('No <a> tag found')
    else:
        print(a)
        print(type(a))  # <class 'bs4.element.Tag'>
        print(a.name)  # tag name
        print(type(a.name))  # <class 'str'>
        print(a.parent.name)
        # The grandparent is None when <a> sits directly under the document.
        grandparent = a.parent.parent
        print(grandparent.name if grandparent is not None else None)
        print(a.attrs)  # tag attributes
        print(type(a.attrs))  # <class 'dict'>
        # Attributes are read like a dict; .get avoids KeyError when the
        # anchor has no href.
        print(a.attrs.get('href'))
        print(a.string)
        print(type(a.string))  # e.g. <class 'bs4.element.NavigableString'>

    newsoup = BeautifulSoup('<b><!--This is a comment--></b><p>This is not a comment</p>', 'html.parser')
    # Prints "This is a comment": the comment markers are stripped.
    print(newsoup.b.string)
    print(type(newsoup.b.string))  # <class 'bs4.element.Comment'>
    print(newsoup.p.string)  # This is not a comment
    print(type(newsoup.p.string))  # <class 'bs4.element.NavigableString'>


def main():
    """Fetch the CSDN home page and run the element demo on it."""
    url = "https://www.csdn.net/"
    txt = getText(url)
    if txt == 'Error':
        # Do not parse the failure sentinel as if it were HTML.
        print('Failed to fetch ' + url)
        return
    solve(txt)


if __name__ == '__main__':
    main()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47

2.HTML内容的三种遍历

下行遍历

上行遍历

平行遍历

Code

import requests
from bs4 import BeautifulSoup


def getText(url):
    """Download ``url`` and return the decoded response body.

    Returns the sentinel string 'Error' when the request fails (connection
    problem, timeout, or an HTTP error status), so callers should check for
    it before parsing the result as HTML.
    """
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        # A timeout keeps the script from hanging forever on a dead server.
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        # Decode the body with the encoding detected from its content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are turned into the sentinel; programming
        # errors and KeyboardInterrupt propagate normally.
        return 'Error'


def down(demo):
    """Downward traversal: print the children and descendants of <head>/<body>."""
    soup = BeautifulSoup(demo, 'html.parser')
    head, body = soup.head, soup.body
    if head is None or body is None:
        # Attribute access on a missing tag would raise AttributeError.
        print('Document has no <head> or <body> tag')
        return

    print(head)
    # List of head's child nodes: both tags and strings such as '\n'.
    print(head.contents)
    print(type(head.contents))  # <class 'list'>
    print(body)
    print(body.contents)
    print(len(body.contents))
    if len(body.contents) > 1:
        print(body.contents[1])
    for child in body.children:  # iterator over the direct children
        print(child)
    for grandchild in body.descendants:  # iterator over all descendants
        print(grandchild)


def up(demo):
    """Upward traversal: print the ancestors of the first <a> and two parents."""
    soup = BeautifulSoup(demo, 'html.parser')
    a = soup.a
    if a is not None:
        # .parents stops after the document object and never yields None,
        # so no None check is needed inside the loop.
        for parent in a.parents:
            print(parent.name)

    title, html = soup.title, soup.html
    # Parent tag of <title>.
    print(title.parent if title is not None else None)
    # <html> is the top-level tag; its parent is the BeautifulSoup document
    # object, not the tag itself.
    print(html.parent if html is not None else None)


def parallel(demo):
    """Sibling traversal: print the nodes before and after the first <a>."""
    soup = BeautifulSoup(demo, 'html.parser')
    a = soup.a
    if a is None:
        print('No <a> tag found')
        return

    nxt = a.next_sibling
    prev = a.previous_sibling
    print(nxt)
    # A missing sibling is None, so the second hop has to be guarded.
    print(nxt.next_sibling if nxt is not None else None)
    print(prev)
    print(prev.previous_sibling if prev is not None else None)
    print(a.parent)
    for sibling in a.next_siblings:  # iterator over the following siblings
        print(sibling)
    for sibling in a.previous_siblings:  # iterator over the preceding siblings
        print(sibling)


def main():
    """Fetch the blog home page and run one of the traversal demos on it."""
    url = "https://fireworks99.github.io/"
    demo = getText(url)
    if demo == 'Error':
        # Do not parse the failure sentinel as if it were HTML.
        print('Failed to fetch ' + url)
        return
    # Uncomment to run the other traversal demos.
    # down(demo)
    # up(demo)
    parallel(demo)


if __name__ == '__main__':
    main()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64

3.HTML格式化prettify

import requests
from bs4 import BeautifulSoup


# General-purpose skeleton for fetching a web page.
def getText(url):
    """Download ``url`` and return the decoded response body.

    Returns the sentinel string "Error occurred" when the request fails
    (connection problem, timeout, or an HTTP error status).
    """
    try:
        # A timeout keeps the script from hanging forever on a dead server.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        # Decode the body with the encoding detected from its content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are turned into the sentinel; programming
        # errors and KeyboardInterrupt propagate normally.
        return "Error occurred"


if __name__ == "__main__":
    url = "https://fireworks99.github.io/"
    demo = getText(url)
    if demo == "Error occurred":
        # Report the failure instead of parsing the sentinel as HTML.
        print(demo)
    else:
        soup = BeautifulSoup(demo, 'html.parser')
        # prettify() re-indents the markup, one node per line.
        print(soup.prettify())
        # It also works on a single tag; skip it when the page has no <a>.
        if soup.a is not None:
            print(soup.a.prettify())
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

4.信息标记与内容查找方法

XML

JSON

YAML

compare

查找

参数

替代

Site by Baole Zhao | Powered by Hexo | theme PreciousJoy