{"id":12762,"date":"2023-11-16T19:37:24","date_gmt":"2023-11-16T10:37:24","guid":{"rendered":"http:\/\/www.tyosuke20xx.com\/blog\/?p=12762"},"modified":"2023-11-16T19:37:26","modified_gmt":"2023-11-16T10:37:26","slug":"python-webcrawler","status":"publish","type":"post","link":"http:\/\/www.tyosuke20xx.com\/blog\/?p=12762","title":{"rendered":"Python Webcrawler"},"content":{"rendered":"\n<p>import requests<br>from bs4 import BeautifulSoup<br><br>def crawl(url, max_depth=2):<br>if max_depth &lt; 0:<br>return<br><br>try:<br>response = requests.get(url)<br>content = response.content<br>soup = BeautifulSoup(content, &#8216;html.parser&#8217;)<br><br>links = set()<br><br>for link in soup.find_all(&#8216;a&#8217;):<br>href = link.get(&#8216;href&#8217;)<br>if href and href.startswith(&#8216;http&#8217;):<br>links.add(href)<br><br>print(f&#8221;Found {len(links)} links at {url}&#8221;)<br><br>for link in links:<br>crawl(link, max_depth &#8211; 1)<br><br>except requests.RequestException as e:<br>print(f&#8221;Error during requests to {url} : {str(e)}&#8221;)<br><br># \u4f7f\u7528\u4f8b<br>start_url = &#8220;<a href=\"https:\/\/b.hatena.ne.jp\/\" rel=\"noreferrer noopener\" target=\"_blank\">https:\/\/<strong>b.hatena.ne.jp<\/strong>\/<em><\/em><\/a>&#8221; # \u30b9\u30bf\u30fc\u30c8\u3059\u308bURL<br>crawl(start_url, max_depth=2) # \u6df1\u30552\u3067\u30af\u30ed\u30fc\u30eb<\/p>\n\n\n\n<p><a href=\"https:\/\/b.hatena.ne.jp\/\" rel=\"noreferrer noopener\" target=\"_blank\"><\/a><a href=\"https:\/\/b.hatena.ne.jp\/\" rel=\"noreferrer noopener\" target=\"_blank\"><\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>import requestsfrom bs4 import BeautifulSoup def crawl(url, max_depth=2):if max_depth &lt; 0:return try:respon &hellip; <a href=\"http:\/\/www.tyosuke20xx.com\/blog\/?p=12762\" class=\"more-link\"><span class=\"screen-reader-text\">&#8220;Python Webcrawler&#8221; \u306e<\/span>\u7d9a\u304d\u3092\u8aad\u3080<\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"_uf_show_specific_survey":0,"_uf_disable_surveys":false,"footnotes":""},"categories":[4,65,6],"tags":[39,17],"class_list":["post-12762","post","type-post","status-publish","format-standard","hentry","category-programming","category-python","category-web","tag-python","tag-web"],"aioseo_notices":[],"jetpack_featured_media_url":"","_links":{"self":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts\/12762","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=12762"}],"version-history":[{"count":1,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts\/12762\/revisions"}],"predecessor-version":[{"id":12763,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts\/12762\/revisions\/12763"}],"wp:attachment":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=12762"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=12762"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=12762"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}