Python-Beautiful Soup

本文最后更新于 1237 天前，其中的信息可能已经有所发展或是发生改变。

(一) 安装

pip install beautifulsoup4
pip install lxml
或
pip install html5lib

(二) Tag

from bs4 import BeautifulSoup
# 直接 .标签
soup.p
# name和attrs两个属性
soup.p["href"] <==> soup.p.get("href")
# 获取标签内容
soup.p.string

(三) 遍历文档树

直接子节点

.contents .children 属性

# 输出方式为列表
print soup.head.contents 
#[The Dormouse's story]

# 需要迭代
print soup.head.children
#<listiterator object at 0x7f71457f5710>

for child in  soup.body.children:
    print child

所有子孙节点

.descendants 属性

for child in soup.descendants:
    print child

父节点，全部父节点

.parent 属性 .parents 属性

兄弟节点

.next_sibling .previous_sibling 属性
.next_siblings .previous_siblings 属性

前后节点

.next_element .previous_element 属性
.next_elements .previous_elements 属性

(四) 搜索文档树

`find_all( name , attrs , recursive , text , **kwargs )`

# 传正则表达式
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# body
# b

# 传列表：如果传入列表参数，Beautiful Soup 会将与列表中任一元素匹配的内容返回。下面代码找到文档中所有 <a> 标签和 <b> 标签
soup.find_all(["a", "b"])
# [The Dormouse's story,
#  Elsie,
#  Lacie,
#  Tillie]

# 传 True True 可以匹配任何值，下面代码查找到所有的 tag, 但是不会返回字符串节点
for tag in soup.find_all(True):
    print(tag.name)
# html
# head
# title
# body
# p

# keyword 参数
soup.find_all(id='link2')
# [Lacie]

soup.find_all(href=re.compile("elsie"))
# [Elsie]

soup.find_all(href=re.compile("elsie"), id='link1')
# [Elsie]

soup.find_all("a", class_="sister")
# [Elsie,
#  Lacie,
#  Tillie]

soup.find_all(text="Elsie")
# [u'Elsie']

soup.find_all(text=["Tillie", "Elsie", "Lacie"])
# [u'Elsie', u'Lacie', u'Tillie']

soup.find_all(text=re.compile("Dormouse"))
# [u"The Dormouse's story", u"The Dormouse's story"]

# limit 参数 find_all () 方法返回全部的搜索结果，如果文档树很大那么搜索会很慢。如果我们不需要全部结果，可以使用 limit 参数限制返回结果的数量。
soup.find_all("a", limit=2)
# [Elsie,
#  Lacie]

# recursive 参数 调用 tag 的 find_all () 方法时，Beautiful Soup 会检索当前 tag 的所有子孙节点，如果只想搜索 tag 的直接子节点，可以使用参数 recursive=False 
soup.html.find_all("title")
# [The Dormouse's story]

soup.html.find_all("title", recursive=False)
# []

# 其他
find( name , attrs , recursive , text , **kwargs )
find_parents() find_parent()
find_next_siblings() find_next_sibling()
find_previous_siblings() find_previous_sibling()
find_all_next() find_next()
find_all_previous () 和 find_previous ()

(五) Css选择器

5.1 通过标签名查找

print soup.select('title') 
#[The Dormouse's story]

5.2 通过类名查找

print soup.select('.sister')
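#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]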

5.3 通过 id 名查找

print soup.select('#link1')
#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

5.4 组合查找

print soup.select('p #link1')
#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

print soup.select("head > title")
#[The Dormouse's story]

5.5 属性查找

print soup.select('a[class="sister"]')
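#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]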

print soup.select('a[href="http://example.com/elsie"]')
#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

print soup.select('p a[href="http://example.com/elsie"]')
#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

soup = BeautifulSoup(html, 'lxml')
print type(soup.select('title'))
print soup.select('title')[0].get_text()

for title in soup.select('title'):
    print title.get_text()