Introduction to BeautifulSoup
Beautiful Soup is a Python library for extracting data from HTML and XML files. Working with your parser of choice, it provides idiomatic ways to navigate, search, and modify the parse tree, and it can save you hours or even days of work.
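As a quick orientation before the demo page used below, here is a minimal sketch that parses a small inline HTML snippet (a hypothetical example, not part of the original tutorial) and pulls out a tag, an attribute, and some text:

```python
from bs4 import BeautifulSoup

# Hypothetical HTML snippet used only for illustration
html_doc = """
<html><head><title>Example page</title></head>
<body><p class="intro">Hello, <a href="https://example.com" id="link1">world</a>!</p></body></html>
"""

soup = BeautifulSoup(html_doc, "html.parser")  # build the parse tree
print(soup.title.string)   # Example page
print(soup.a["href"])      # https://example.com
print(soup.p.get_text())   # Hello, world!
```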
Getting Started with BeautifulSoup
```python
from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
#print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# Print the entire page
#print(soup.prettify())
# Extract the <title> tag from the page
print(soup.title)
# Extract the first <a> tag from the page
print(soup.a)
# Name of the parent tag of the <a> tag
print(soup.a.parent.name)
# Name of the grandparent tag of the <a> tag
print(soup.a.parent.parent.name)

tag = soup.a
print(tag)
# Show all attributes of the tag
print(tag.attrs)
# Get the value of a particular attribute
print(tag.attrs['class'])
# Get the text inside a tag
print(soup.a.string)
print(soup.p.string)
```
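One caveat worth noting: indexing a tag (or tag.attrs) with an attribute it does not have raises a KeyError, while tag.get() returns None. A minimal, self-contained sketch of the difference (the snippet is hypothetical):

```python
from bs4 import BeautifulSoup

# Tiny snippet for illustration only
tag = BeautifulSoup('<a class="py1" href="http://example.com">link</a>', "html.parser").a

print(tag.attrs['class'])   # ['py1'] -- the attribute exists (class is multi-valued, hence a list)
print(tag.get('id'))        # None    -- missing attribute, no exception
# print(tag['id'])          # would raise KeyError: 'id'
```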
BeautifulSoup Traversal Methods
- Downward traversal
```python
from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# Contents of <head>
print(soup.head)
# Child nodes of <head>
print(soup.head.contents)
# Child nodes of <body>
print(soup.body.contents)
# Number of child nodes
print(len(soup.body.contents))
# A specific child node
print(soup.body.contents[1])
# Downward traversal over direct children
for child in soup.body.children:
    print(child)
# Downward traversal over all descendants
print("____________________")
for child in soup.body.descendants:
    print(child)
```
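Note that .contents and .children yield NavigableString nodes (often just the newlines between tags) as well as Tag objects. If only the tag children are wanted, a short sketch like the following (an illustrative pattern, not part of the tutorial) filters them with isinstance:

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

html = "<body>\n<p>first</p>\n<p>second</p>\n</body>"
soup = BeautifulSoup(html, "html.parser")

# Keep only Tag children, skipping the whitespace NavigableStrings
tag_children = [child for child in soup.body.children if isinstance(child, Tag)]
print([t.name for t in tag_children])   # ['p', 'p']
```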
- Upward traversal
```python
from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.html.parent)
# Upward traversal of the tag tree
for parent in soup.a.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)
```
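As a small variation, the upward traversal can be collected into the path from a tag to the document root; the final parent yielded is the BeautifulSoup object itself, whose name is '[document]'. A hypothetical example:

```python
from bs4 import BeautifulSoup

html = "<html><body><div><p><a href='#'>x</a></p></div></body></html>"
soup = BeautifulSoup(html, "html.parser")

# Ancestor tag names from the <a> tag up to the root
path = [parent.name for parent in soup.a.parents]
print(path)   # ['p', 'div', 'body', 'html', '[document]']
```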
- Sibling traversal
```python
# Sibling traversal only happens among nodes under the same parent
from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# The next sibling of the <a> tag
print(soup.a.next_sibling)
# The sibling after that
print(soup.a.next_sibling.next_sibling)
# The previous sibling of the <a> tag
print(soup.a.previous_sibling)
# Check the parent of the <a> tag
print(soup.a.parent)
# Traverse all following siblings
print("===================")
for sibling in soup.a.next_siblings:
    print(sibling)
print("===================")
# Traverse all preceding siblings
for sibling in soup.a.previous_siblings:
    print(sibling)
```
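Because the newlines between tags are themselves siblings, next_sibling and previous_sibling frequently return a bare string rather than the neighbouring tag. BeautifulSoup's find_next_sibling() and find_previous_sibling() skip straight to tag siblings; a short sketch of the difference (with a made-up snippet):

```python
from bs4 import BeautifulSoup

html = "<body>\n<a id='link1'>one</a>\n<a id='link2'>two</a>\n</body>"
soup = BeautifulSoup(html, "html.parser")

first = soup.find(id='link1')
print(repr(first.next_sibling))       # '\n' -- the whitespace text node comes first
print(first.find_next_sibling('a'))   # <a id="link2">two</a>
```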
BeautifulSoup Search Methods
- Find methods (1)
```python
from bs4 import BeautifulSoup
import requests
import re

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# Find all <a> tags
print(soup.find_all('a'))
# A list of tag names can be passed as well
print(soup.find_all(['a', 'b']))
# Passing True matches every tag in the document
for tag in soup.find_all(True):
    print(tag.name)
# Find all tags whose name starts with 'b' (here: body and b)
for tag in soup.find_all(re.compile('^b')):
    print(tag.name)
```
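find_all() also accepts a function as the filter; the function is called for every tag and should return True for those to keep. A minimal sketch (hypothetical snippet) that keeps every tag carrying an href attribute:

```python
from bs4 import BeautifulSoup

html = '<p><a href="http://example.com/a">a</a><b>bold</b><a href="http://example.com/b">b</a></p>'
soup = BeautifulSoup(html, "html.parser")

# The filter function is applied to every Tag in the tree
for tag in soup.find_all(lambda t: t.has_attr('href')):
    print(tag['href'])
# http://example.com/a
# http://example.com/b
```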
- Find methods (2)
```python
from bs4 import BeautifulSoup
import requests
import re

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# Find <p> tags whose class attribute is 'course'
print(soup.find_all('p', 'course'))
# Find tags with id='link1' (searching in the attribute domain)
print(soup.find_all(id='link1'))
# Attribute values must match exactly: id='link' finds nothing here
print(soup.find_all(id='link'))
# Use a regular expression to find all tags whose id contains 'link'
print(soup.find_all(id=re.compile('link')))
# recursive controls whether all descendants are searched; the default is True
print(soup.find_all('a'))
print(soup.find_all('a', recursive=False))
# Search the string domain for an exact string
print(soup.find_all(string='Basic Python'))
# A regular expression matches more strings
print(soup.find_all(string=re.compile('Python')))
```
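Beyond find_all(), BeautifulSoup also supports CSS selectors through select() and select_one(), which can express many of the same queries more compactly. A short sketch with a made-up snippet:

```python
from bs4 import BeautifulSoup

html = ('<p class="course">Python course'
        '<a id="link1" href="http://example.com/1">one</a>'
        '<a id="link2" href="http://example.com/2">two</a></p>')
soup = BeautifulSoup(html, "html.parser")

print(soup.select('p.course'))     # <p> tags with class "course"
print(soup.select_one('#link1'))   # the single tag with id="link1"
print(soup.select('p a'))          # <a> tags that are descendants of a <p>
```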
BeautifulSoup: HTML Formatting and Encoding
```python
from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
print(r.text)
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# Print the entire page in a friendlier, indented form
print(soup.prettify())
# Prettify a single element (the <a> tag)
print(soup.a.prettify())
```
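On the encoding side, BeautifulSoup converts every document to Unicode internally; the encoding it detected for a bytes input is exposed as soup.original_encoding, and prettify() returns a Unicode str that you can encode yourself when writing to disk. A small sketch under those assumptions (the output path is hypothetical):

```python
from bs4 import BeautifulSoup

# Bytes input with non-ASCII content, encoded as UTF-8
raw = '<html><head><title>编码示例</title></head><body><p>中文内容</p></body></html>'.encode('utf-8')

soup = BeautifulSoup(raw, "html.parser")
print(soup.original_encoding)   # e.g. 'utf-8' -- the encoding BeautifulSoup detected
pretty = soup.prettify()        # a Unicode str regardless of the input encoding
with open("demo_pretty.html", "w", encoding="utf-8") as f:   # hypothetical output file
    f.write(pretty)
```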