{"id":249,"date":"2022-02-20T18:21:38","date_gmt":"2022-02-20T10:21:38","guid":{"rendered":"http:\/\/blog.iichen.cn\/?p=249"},"modified":"2022-02-20T18:21:38","modified_gmt":"2022-02-20T10:21:38","slug":"python-beanutiful-soup","status":"publish","type":"post","link":"https:\/\/iichen.cn\/?p=249","title":{"rendered":"Python-Beanutiful Soup"},"content":{"rendered":"<h3>(\u4e00) \u5b89\u88c5<\/h3>\n<pre><code class=\"language-python\">pip install beautifulsoup4\npip install lxml\n\u6216\npip install html5lib<\/code><\/pre>\n<h3>(\u4e8c) Tag<\/h3>\n<pre><code class=\"language-python\">from bs4 import BeautifulSoup\n# \u76f4\u63a5 .\u6807\u7b7e\nbs4.p\n# name\u548cattrs\u4e24\u4e2a\u5c5e\u6027\nbs4.p[\"href\"] <==> bs4.p.get(\"href\")\n# \u83b7\u53d6\u6807\u7b7e\u5185\u5bb9\nbs4.p.string\n<\/code><\/pre>\n<h3>(\u4e09) \u904d\u5386\u6587\u6863\u6811<\/h3>\n<h4>\u76f4\u63a5\u5b50\u8282\u70b9<\/h4>\n<blockquote><p>\n.contents .children \u5c5e\u6027\n<\/p><\/blockquote>\n<pre><code class=\"language-python\"># \u8f93\u51fa\u65b9\u5f0f\u4e3a\u5217\u8868\nprint soup.head.contents\u00a0\n#[<title>The Dormouse's story<\/title>]<\/code><\/pre>\n<pre><code class=\"language-python\"># \u9700\u8981\u8fed\u4ee3\nprint soup.head.children\n#<listiterator object at 0x7f71457f5710>\n\nfor child in  soup.body.children:\n    print child<\/code><\/pre>\n<h4>\u6240\u6709\u5b50\u5b59\u8282\u70b9<\/h4>\n<blockquote><p>\n.descendants \u5c5e\u6027\n<\/p><\/blockquote>\n<pre><code class=\"language-python\">for child in soup.descendants:\n    print child<\/code><\/pre>\n<h4>\u7236\u8282\u70b9\uff0c\u5168\u90e8\u7236\u8282\u70b9<\/h4>\n<p><code> .parent \u5c5e\u6027 .parents \u5c5e\u6027<\/code><\/p>\n<h4>\u5144\u5f1f\u8282\u70b9<\/h4>\n<p><code>.next_sibling .previous_sibling \u5c5e\u6027<\/code><br \/>\n<code>.next_siblings .previous_siblings \u5c5e\u6027<\/code><\/p>\n<h4>\u524d\u540e\u8282\u70b9<\/h4>\n<p><code>.next_element .previous_element \u5c5e\u6027<\/code><br \/>\n<code>.next_elements 
.previous_elements \u5c5e\u6027<\/code><\/p>\n<h3>(\u56db) \u641c\u7d22\u6587\u6863\u6811<\/h3>\n<p><code>find_all( name , attrs , recursive , text , **kwargs )<\/code><\/p>\n<pre><code class=\"language-python\"># \u4f20\u6b63\u5219\u8868\u8fbe\u5f0f\nimport re\nfor tag in soup.find_all(re.compile(\"^b\")):\n    print(tag.name)\n# body\n# b\n\n# \u4f20\u5217\u8868 \u5982\u679c\u4f20\u5165\u5217\u8868\u53c2\u6570\uff0cBeautiful Soup \u4f1a\u5c06\u4e0e\u5217\u8868\u4e2d\u4efb\u4e00\u5143\u7d20\u5339\u914d\u7684\u5185\u5bb9\u8fd4\u56de\u3002\u4e0b\u9762\u4ee3\u7801\u627e\u5230\u6587\u6863\u4e2d\u6240\u6709 &lt;a&gt; \u6807\u7b7e\u548c &lt;b&gt; \u6807\u7b7e\nsoup.find_all([\"a\", \"b\"])\n# [<b>The Dormouse's story<\/b>,\n#  <a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\">Elsie<\/a>,\n#  <a class=\"sister\" href=\"http:\/\/example.com\/lacie\" id=\"link2\">Lacie<\/a>,\n#  <a class=\"sister\" href=\"http:\/\/example.com\/tillie\" id=\"link3\">Tillie<\/a>]\n\n# \u4f20 True True \u53ef\u4ee5\u5339\u914d\u4efb\u4f55\u503c\uff0c\u4e0b\u9762\u4ee3\u7801\u67e5\u627e\u5230\u6240\u6709\u7684 tag, \u4f46\u662f\u4e0d\u4f1a\u8fd4\u56de\u5b57\u7b26\u4e32\u8282\u70b9\nfor tag in soup.find_all(True):\n    print(tag.name)\n# html\n# head\n# title\n# body\n# p\n\n# keyword \u53c2\u6570\nsoup.find_all(id='link2')\n# [<a class=\"sister\" href=\"http:\/\/example.com\/lacie\" id=\"link2\">Lacie<\/a>]\n\nsoup.find_all(href=re.compile(\"elsie\"))\n# [<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\">Elsie<\/a>]\n\nsoup.find_all(href=re.compile(\"elsie\"), id='link1')\n# [<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\">three<\/a>]\n\nsoup.find_all(\"a\", class_=\"sister\")\n# [<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\">Elsie<\/a>,\n#  <a class=\"sister\" href=\"http:\/\/example.com\/lacie\" id=\"link2\">Lacie<\/a>,\n#  <a class=\"sister\" href=\"http:\/\/example.com\/tillie\" id=\"link3\">Tillie<\/a>]\n\nsoup.find_all(text=\"Elsie\")\n# 
[u'Elsie']\n\nsoup.find_all(text=[\"Tillie\", \"Elsie\", \"Lacie\"])\n# [u'Elsie', u'Lacie', u'Tillie']\n\nsoup.find_all(text=re.compile(\"Dormouse\"))\n# [u\"The Dormouse's story\", u\"The Dormouse's story\"]\n\n# limit \u53c2\u6570 find_all () \u65b9\u6cd5\u8fd4\u56de\u5168\u90e8\u7684\u641c\u7d22\u7ed3\u679c\uff0c\u5982\u679c\u6587\u6863\u6811\u5f88\u5927\u90a3\u4e48\u641c\u7d22\u4f1a\u5f88\u6162\u3002\u5982\u679c\u6211\u4eec\u4e0d\u9700\u8981\u5168\u90e8\u7ed3\u679c\uff0c\u53ef\u4ee5\u4f7f\u7528 limit \u53c2\u6570\u9650\u5236\u8fd4\u56de\u7ed3\u679c\u7684\u6570\u91cf\u3002\nsoup.find_all(\"a\", limit=2)\n# [<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\">Elsie<\/a>,\n#  <a class=\"sister\" href=\"http:\/\/example.com\/lacie\" id=\"link2\">Lacie<\/a>]\n\n# recursive \u53c2\u6570 \u8c03\u7528 tag \u7684 find_all () \u65b9\u6cd5\u65f6\uff0cBeautiful Soup \u4f1a\u68c0\u7d22\u5f53\u524d tag \u7684\u6240\u6709\u5b50\u5b59\u8282\u70b9\uff0c\u5982\u679c\u53ea\u60f3\u641c\u7d22 tag \u7684\u76f4\u63a5\u5b50\u8282\u70b9\uff0c\u53ef\u4ee5\u4f7f\u7528\u53c2\u6570 recursive=False \nsoup.html.find_all(\"title\")\n# [<title>The Dormouse's story<\/title>]\n\nsoup.html.find_all(\"title\", recursive=False)\n# []\n\n# \u5176\u4ed6\nfind( name , attrs , recursive , text , **kwargs )\nfind_parents() find_parent()\nfind_next_siblings() find_next_sibling()\nfind_previous_siblings() find_previous_sibling()\nfind_all_next() find_next()\nfind_all_previous () \u548c find_previous ()\n<\/code><\/pre>\n<h3>(\u4e94) Css\u9009\u62e9\u5668<\/h3>\n<h4>5.1 \u901a\u8fc7\u6807\u7b7e\u540d\u67e5\u627e<\/h4>\n<pre><code class=\"language-python\">print soup.select('title')\u00a0\n#[<title>The Dormouse's story<\/title>]<\/code><\/pre>\n<h4>5.2 \u901a\u8fc7\u7c7b\u540d\u67e5\u627e<\/h4>\n<pre><code class=\"language-python\">print soup.select('.sister')\n#[<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\"><!-- Elsie --><\/a>, <a class=\"sister\" 
href=\"http:\/\/example.com\/lacie\" id=\"link2\">Lacie<\/a>, <a class=\"sister\" href=\"http:\/\/example.com\/tillie\" id=\"link3\">Tillie<\/a>]<\/code><\/pre>\n<h4>5.3 \u901a\u8fc7 id \u540d\u67e5\u627e<\/h4>\n<pre><code class=\"language-python\">print soup.select('#link1')\n#[<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\"><!-- Elsie --><\/a>]<\/code><\/pre>\n<h4>5.4 \u7ec4\u5408\u67e5\u627e<\/h4>\n<pre><code class=\"language-python\">print soup.select('p #link1')\n#[<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\"><!-- Elsie --><\/a>]\n\nprint soup.select(\"head > title\")\n#[<title>The Dormouse's story<\/title>]<\/code><\/pre>\n<h4>5.5 \u5c5e\u6027\u67e5\u627e<\/h4>\n<pre><code class=\"language-python\">print soup.select('a[class=\"sister\"]')\n#[<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\"><!-- Elsie --><\/a>, <a class=\"sister\" href=\"http:\/\/example.com\/lacie\" id=\"link2\">Lacie<\/a>, <a class=\"sister\" href=\"http:\/\/example.com\/tillie\" id=\"link3\">Tillie<\/a>]\n\nprint soup.select('a[href=\"http:\/\/example.com\/elsie\"]')\n#[<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\"><!-- Elsie --><\/a>]\n\nprint soup.select('p a[href=\"http:\/\/example.com\/elsie\"]')\n#[<a class=\"sister\" href=\"http:\/\/example.com\/elsie\" id=\"link1\"><!-- Elsie --><\/a>]<\/code><\/pre>\n<pre><code class=\"language-python\">soup = BeautifulSoup(html, 'lxml')\nprint type(soup.select('title'))\nprint soup.select('title')[0].get_text()\n\nfor title in soup.select('title'):\n    print title.get_text()<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>(\u4e00) \u5b89\u88c5 pip install beautifulsoup4 pip install lxml \u6216 pi 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[15,8],"tags":[],"class_list":["post-249","post","type-post","status-publish","format-standard","hentry","category-python","category-8"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v24.4 - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ -->\n<title>Python-Beanutiful Soup - IIchen<\/title>\n<meta name=\"description\" content=\"Python-Beanutiful Soup \u57fa\u672c\u6587\u6863\" \/>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/iichen.cn\/?p=249\" \/>\n<meta property=\"og:locale\" content=\"zh_CN\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Python-Beanutiful Soup - IIchen\" \/>\n<meta property=\"og:description\" content=\"Python-Beanutiful Soup \u57fa\u672c\u6587\u6863\" \/>\n<meta property=\"og:url\" content=\"https:\/\/iichen.cn\/?p=249\" \/>\n<meta property=\"og:site_name\" content=\"IIchen\" \/>\n<meta property=\"article:published_time\" content=\"2022-02-20T10:21:38+00:00\" \/>\n<meta name=\"author\" content=\"iichen\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"\u4f5c\u8005\" \/>\n\t<meta name=\"twitter:data1\" content=\"iichen\" \/>\n\t<meta name=\"twitter:label2\" content=\"\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4\" \/>\n\t<meta name=\"twitter:data2\" content=\"2 \u5206\" \/>\n<script type=\"application\/ld+json\" 
class=\"yoast-schema-graph\">{\"@context\":\"https:\/\/schema.org\",\"@graph\":[{\"@type\":\"Article\",\"@id\":\"https:\/\/iichen.cn\/?p=249#article\",\"isPartOf\":{\"@id\":\"https:\/\/iichen.cn\/?p=249\"},\"author\":{\"name\":\"iichen\",\"@id\":\"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c\"},\"headline\":\"Python-Beanutiful Soup\",\"datePublished\":\"2022-02-20T10:21:38+00:00\",\"mainEntityOfPage\":{\"@id\":\"https:\/\/iichen.cn\/?p=249\"},\"wordCount\":15,\"commentCount\":0,\"publisher\":{\"@id\":\"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c\"},\"articleSection\":[\"Python\",\"\u7b14\u8bb0\"],\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"CommentAction\",\"name\":\"Comment\",\"target\":[\"https:\/\/iichen.cn\/?p=249#respond\"]}]},{\"@type\":\"WebPage\",\"@id\":\"https:\/\/iichen.cn\/?p=249\",\"url\":\"https:\/\/iichen.cn\/?p=249\",\"name\":\"Python-Beanutiful Soup - IIchen\",\"isPartOf\":{\"@id\":\"https:\/\/iichen.cn\/#website\"},\"datePublished\":\"2022-02-20T10:21:38+00:00\",\"description\":\"Python-Beanutiful Soup \u57fa\u672c\u6587\u6863\",\"breadcrumb\":{\"@id\":\"https:\/\/iichen.cn\/?p=249#breadcrumb\"},\"inLanguage\":\"zh-Hans\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\/\/iichen.cn\/?p=249\"]}]},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\/\/iichen.cn\/?p=249#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"\u9996\u9875\",\"item\":\"https:\/\/iichen.cn\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Python-Beanutiful Soup\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\/\/iichen.cn\/#website\",\"url\":\"https:\/\/iichen.cn\/\",\"name\":\"IIchen\",\"description\":\"Just do 
it!\",\"publisher\":{\"@id\":\"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\/\/iichen.cn\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"zh-Hans\"},{\"@type\":[\"Person\",\"Organization\"],\"@id\":\"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c\",\"name\":\"iichen\",\"image\":{\"@type\":\"ImageObject\",\"inLanguage\":\"zh-Hans\",\"@id\":\"https:\/\/iichen.cn\/#\/schema\/person\/image\/\",\"url\":\"https:\/\/iichen.cn\/wp-content\/uploads\/2025\/01\/avatar.jpg\",\"contentUrl\":\"https:\/\/iichen.cn\/wp-content\/uploads\/2025\/01\/avatar.jpg\",\"width\":940,\"height\":940,\"caption\":\"iichen\"},\"logo\":{\"@id\":\"https:\/\/iichen.cn\/#\/schema\/person\/image\/\"},\"sameAs\":[\"https:\/\/www.iichen.cn\"],\"url\":\"https:\/\/iichen.cn\/?author=1\"}]}<\/script>\n<!-- \/ Yoast SEO plugin. 
-->","yoast_head_json":{"title":"Python-Beanutiful Soup - IIchen","description":"Python-Beanutiful Soup \u57fa\u672c\u6587\u6863","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/iichen.cn\/?p=249","og_locale":"zh_CN","og_type":"article","og_title":"Python-Beanutiful Soup - IIchen","og_description":"Python-Beanutiful Soup \u57fa\u672c\u6587\u6863","og_url":"https:\/\/iichen.cn\/?p=249","og_site_name":"IIchen","article_published_time":"2022-02-20T10:21:38+00:00","author":"iichen","twitter_card":"summary_large_image","twitter_misc":{"\u4f5c\u8005":"iichen","\u9884\u8ba1\u9605\u8bfb\u65f6\u95f4":"2 \u5206"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"Article","@id":"https:\/\/iichen.cn\/?p=249#article","isPartOf":{"@id":"https:\/\/iichen.cn\/?p=249"},"author":{"name":"iichen","@id":"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c"},"headline":"Python-Beanutiful Soup","datePublished":"2022-02-20T10:21:38+00:00","mainEntityOfPage":{"@id":"https:\/\/iichen.cn\/?p=249"},"wordCount":15,"commentCount":0,"publisher":{"@id":"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c"},"articleSection":["Python","\u7b14\u8bb0"],"inLanguage":"zh-Hans","potentialAction":[{"@type":"CommentAction","name":"Comment","target":["https:\/\/iichen.cn\/?p=249#respond"]}]},{"@type":"WebPage","@id":"https:\/\/iichen.cn\/?p=249","url":"https:\/\/iichen.cn\/?p=249","name":"Python-Beanutiful Soup - IIchen","isPartOf":{"@id":"https:\/\/iichen.cn\/#website"},"datePublished":"2022-02-20T10:21:38+00:00","description":"Python-Beanutiful Soup 
\u57fa\u672c\u6587\u6863","breadcrumb":{"@id":"https:\/\/iichen.cn\/?p=249#breadcrumb"},"inLanguage":"zh-Hans","potentialAction":[{"@type":"ReadAction","target":["https:\/\/iichen.cn\/?p=249"]}]},{"@type":"BreadcrumbList","@id":"https:\/\/iichen.cn\/?p=249#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"\u9996\u9875","item":"https:\/\/iichen.cn\/"},{"@type":"ListItem","position":2,"name":"Python-Beanutiful Soup"}]},{"@type":"WebSite","@id":"https:\/\/iichen.cn\/#website","url":"https:\/\/iichen.cn\/","name":"IIchen","description":"Just do it!","publisher":{"@id":"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/iichen.cn\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"zh-Hans"},{"@type":["Person","Organization"],"@id":"https:\/\/iichen.cn\/#\/schema\/person\/4a47edf85ab49841df9e8f6aee40b77c","name":"iichen","image":{"@type":"ImageObject","inLanguage":"zh-Hans","@id":"https:\/\/iichen.cn\/#\/schema\/person\/image\/","url":"https:\/\/iichen.cn\/wp-content\/uploads\/2025\/01\/avatar.jpg","contentUrl":"https:\/\/iichen.cn\/wp-content\/uploads\/2025\/01\/avatar.jpg","width":940,"height":940,"caption":"iichen"},"logo":{"@id":"https:\/\/iichen.cn\/#\/schema\/person\/image\/"},"sameAs":["https:\/\/www.iichen.cn"],"url":"https:\/\/iichen.cn\/?author=1"}]}},"_links":{"self":[{"href":"https:\/\/iichen.cn\/index.php?rest_route=\/wp\/v2\/posts\/249","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/iichen.cn\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/iichen.cn\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/iichen.cn\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/iichen.cn\/index.php?rest_route=%2Fwp%
2Fv2%2Fcomments&post=249"}],"version-history":[{"count":0,"href":"https:\/\/iichen.cn\/index.php?rest_route=\/wp\/v2\/posts\/249\/revisions"}],"wp:attachment":[{"href":"https:\/\/iichen.cn\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=249"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/iichen.cn\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=249"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/iichen.cn\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=249"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}