About BeautifulSoup
Description
- BeautifulSoup的基本元素
- HTML内容遍历方法
- HTML格式化
- 信息标记与内容查找方法
1.BeautifulSoup五种基本元素
import requests
from bs4 import BeautifulSoup
def getText(url, timeout=30):
    """Download *url* and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The page text, or the string 'Error' when the request fails
        (connection problem, timeout, invalid URL, or a 4xx/5xx status).
    """
    # Send a browser-like User-Agent header with the request.
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout requests may wait on a silent server forever.
        r = requests.get(url, headers=kv, timeout=timeout)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        # Decode using the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; Ctrl-C and genuine
        # programming errors propagate instead of being silently swallowed.
        return 'Error'
def solve(txt):
    """Print examples of the five basic BeautifulSoup element kinds.

    Parses *txt* as HTML and prints, for the first <a> tag: the tag itself,
    its name, its ancestors' names, its attributes and its string. Then
    parses a small literal snippet to contrast a comment string with an
    ordinary string.

    Args:
        txt: HTML markup to parse.
    """
    soup = BeautifulSoup(txt, 'html.parser')
    print(soup.title)

    # Tag: attribute access by tag name returns only the first match.
    first_link = soup.a
    print(first_link)
    print(type(first_link))  # <class 'bs4.element.Tag'>

    # Name: the tag's name is a plain str.
    tag_name = first_link.name
    print(tag_name)
    print(type(tag_name))  # <class 'str'>

    # Names of the enclosing tag and of the tag enclosing that one.
    enclosing = first_link.parent
    print(enclosing.name)
    print(enclosing.parent.name)

    # Attributes: exposed as a dict, so values are read by key.
    attributes = first_link.attrs
    print(attributes)
    print(type(attributes))  # <class 'dict'>
    print(attributes['href'])

    # NavigableString: the text held inside the tag.
    link_text = first_link.string
    print(link_text)
    print(type(link_text))  # <class 'bs4.element.NavigableString'>

    # Comment: .string drops the <!-- --> markers, so only the type tells a
    # comment (<b>, bs4.element.Comment) apart from ordinary text
    # (<p>, bs4.element.NavigableString).
    newsoup = BeautifulSoup('<b><!--This is a comment--></b><p>This is not a comment</p>', 'html.parser')
    for tag in (newsoup.b, newsoup.p):
        print(tag.string)
        print(type(tag.string))
def main():
    """Fetch the CSDN home page and run the basic-element demo on it."""
    solve(getText("https://www.csdn.net/"))


if __name__ == '__main__':
    main()
2.HTML内容的三种遍历
Code
import requests
from bs4 import BeautifulSoup
def getText(url, timeout=30):
    """Download *url* and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The page text, or the string 'Error' when the request fails
        (connection problem, timeout, invalid URL, or a 4xx/5xx status).
    """
    # Send a browser-like User-Agent header with the request.
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout requests may wait on a silent server forever.
        r = requests.get(url, headers=kv, timeout=timeout)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        # Decode using the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; Ctrl-C and genuine
        # programming errors propagate instead of being silently swallowed.
        return 'Error'
def down(demo):
    """Demonstrate downward traversal: contents, children and descendants.

    Args:
        demo: HTML markup to parse.
    """
    soup = BeautifulSoup(demo, 'html.parser')

    head = soup.head
    print(head)
    # .contents is a list of the direct child nodes; it holds string nodes
    # (such as '\n') as well as tag nodes.
    print(head.contents)
    print(type(head.contents))  # <class 'list'>

    body = soup.body
    print(body)
    body_nodes = body.contents
    print(body_nodes)
    print(len(body_nodes))
    print(body_nodes[1])

    # .children iterates over the direct children only.
    for node in body.children:
        print(node)

    # .descendants iterates over every nested node, not just direct children.
    for node in body.descendants:
        print(node)
def up(demo):
    """Demonstrate upward traversal: .parents and .parent.

    Args:
        demo: HTML markup to parse.
    """
    soup = BeautifulSoup(demo, 'html.parser')

    # .parents yields every ancestor of the first <a>, ending with the
    # BeautifulSoup object itself (whose name is '[document]'). It stops
    # before None and never yields it, so no None check is needed here.
    for ancestor in soup.a.parents:
        print(ancestor.name)

    # .parent is the single tag that directly encloses the node.
    print(soup.title.parent)

    # <html> is the top-level tag; its parent is the BeautifulSoup document
    # object (not <html> itself), which prints as the whole document.
    print(soup.html.parent)
def parallel(demo):
    """Demonstrate sideways traversal among the first <a> tag's siblings.

    Args:
        demo: HTML markup to parse.
    """
    soup = BeautifulSoup(demo, 'html.parser')
    anchor = soup.a

    # One and two steps forward among the nodes sharing the same parent.
    following = anchor.next_sibling
    print(following)
    print(following.next_sibling)

    # One and two steps backward.
    preceding = anchor.previous_sibling
    print(preceding)
    print(preceding.previous_sibling)

    print(anchor.parent)

    # Iterators over all later siblings, then over all earlier siblings.
    for node in anchor.next_siblings:
        print(node)
    for node in anchor.previous_siblings:
        print(node)
def main():
    """Fetch the demo page and run one of the traversal demos on it."""
    page = getText("https://fireworks99.github.io/")
    # Call down(page) or up(page) here instead to see the other traversals.
    parallel(page)


if __name__ == '__main__':
    main()
3.HTML格式化prettify
import requests
from bs4 import BeautifulSoup
# 爬取网页的通用代码框架
def getText(url, timeout=30):
    """Download *url* and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The page text, or the string "Error occurred" when the request
        fails (connection problem, timeout, invalid URL, or a 4xx/5xx
        status).
    """
    try:
        # Without a timeout requests may wait on a silent server forever.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        # Decode using the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; Ctrl-C and genuine
        # programming errors propagate instead of being silently swallowed.
        return "Error occurred"
if __name__ == "__main__":
    # Fetch and parse the demo page.
    soup = BeautifulSoup(getText("https://fireworks99.github.io/"), 'html.parser')
    # prettify() returns the markup re-indented one node per line; show it
    # for the whole document first, then for the first <a> tag alone.
    for node in (soup, soup.a):
        print(node.prettify())
4.信息标记与内容查找方法