{"id":25689,"date":"2024-11-24T16:49:29","date_gmt":"2024-11-24T07:49:29","guid":{"rendered":"http:\/\/www.tyosuke20xx.com\/blog\/?p=25689"},"modified":"2025-01-06T17:57:36","modified_gmt":"2025-01-06T08:57:36","slug":"%e3%80%90python%e3%80%91%e3%82%af%e3%83%ad%e3%83%bc%e3%83%a9%e3%83%bc%e5%9e%8b%e6%a4%9c%e7%b4%a2%e3%82%a8%e3%83%b3%e3%82%b8%e3%83%b3","status":"publish","type":"post","link":"http:\/\/www.tyosuke20xx.com\/blog\/?p=25689","title":{"rendered":"\u3010python\u3011\u30af\u30ed\u30fc\u30e9\u30fc\u578b\u691c\u7d22\u30a8\u30f3\u30b8\u30f3"},"content":{"rendered":"\n<pre class=\"wp-block-code\"><code>import requests\nfrom bs4 import BeautifulSoup\nfrom urllib.parse import urljoin, urlparse\nimport sqlite3\nimport threading\nimport queue\nimport tkinter as tk\nfrom tkinter import ttk, messagebox\n\n# \u30c7\u30fc\u30bf\u30d9\u30fc\u30b9\u8a2d\u5b9a\nDB_NAME = \"gui_search_engine.db\"\nconn = sqlite3.connect(DB_NAME, check_same_thread=False)\ncursor = conn.cursor()\n\n# \u30c6\u30fc\u30d6\u30eb\u4f5c\u6210\ncursor.execute(\"\"\"\nCREATE TABLE IF NOT EXISTS pages (\n    id INTEGER PRIMARY KEY AUTOINCREMENT,\n    url TEXT UNIQUE,\n    title TEXT,\n    description TEXT,\n    content TEXT\n)\n\"\"\")\nconn.commit()\n\n# \u30b0\u30ed\u30fc\u30d0\u30eb\u5909\u6570\nvisited = set()\nvisited_lock = threading.Lock()\ntask_queue = queue.Queue()\nMAX_THREADS = 5\n\n# \u30da\u30fc\u30b8\u3092\u30c7\u30fc\u30bf\u30d9\u30fc\u30b9\u306b\u4fdd\u5b58\ndef save_page_to_db(url, title, description, content):\n    try:\n        cursor.execute(\"INSERT INTO pages (url, title, description, content) VALUES (?, ?, ?, ?)\",\n                       (url, title, description, content))\n        conn.commit()\n    except sqlite3.IntegrityError:\n        pass  # URL\u91cd\u8907\u6642\u306f\u7121\u8996\n\n# URL\u6b63\u898f\u5316\ndef normalize_url(base, link):\n    return urljoin(base, link).split('#')&#91;0]\n\n# \u30af\u30ed\u30fc\u30e9\u30fc\ndef crawl(url, domain, status_label):\n    with visited_lock:\n        if url in visited:\n            return\n        visited.add(url)\n\n    try:\n        response = requests.get(url, timeout=10)\n        soup = BeautifulSoup(response.content, \"html.parser\")\n\n        # \u30e1\u30bf\u30c7\u30fc\u30bf\u53ce\u96c6\n        title = soup.title.string if soup.title else \"No Title\"\n        description_tag = soup.find(\"meta\", attrs={\"name\": \"description\"})\n        description = description_tag&#91;\"content\"] if description_tag else \"No Description\"\n        content = soup.get_text()\n\n        # \u30c7\u30fc\u30bf\u30d9\u30fc\u30b9\u306b\u4fdd\u5b58\n        save_page_to_db(url, title, description, content)\n\n        # \u30b9\u30c6\u30fc\u30bf\u30b9\u66f4\u65b0\n        status_label.config(text=f\"Crawling: {url}\")\n\n        # \u6b21\u306eURL\u3092\u53ce\u96c6\n        for link in soup.find_all('a', href=True):\n            full_url = normalize_url(url, link&#91;'href'])\n            if urlparse(full_url).netloc == domain:\n                task_queue.put(full_url)\n    except Exception as e:\n        print(f\"Error crawling {url}: {e}\")\n\n# \u4e26\u5217\u30af\u30ed\u30fc\u30ea\u30f3\u30b0\ndef start_crawling(start_url, domain, status_label):\n    visited.clear()\n    task_queue.put(start_url)\n\n    def worker():\n        while not task_queue.empty():\n            url = task_queue.get()\n            crawl(url, domain, status_label)\n        status_label.config(text=\"Crawling Complete\")\n\n    threads = &#91;]\n    for _ in range(MAX_THREADS):\n        thread = threading.Thread(target=worker)\n        threads.append(thread)\n        thread.start()\n    for thread in threads:\n        thread.join()\n\n# \u691c\u7d22\u6a5f\u80fd\ndef search(query, results_box):\n    cursor.execute(\"SELECT url, title, description FROM pages WHERE content LIKE ?\", (f\"%{query}%\",))\n    results = cursor.fetchall()\n    results_box.delete(*results_box.get_children())  # \u7d50\u679c\u3092\u30af\u30ea\u30a2\n\n    if not results:\n        messagebox.showinfo(\"\u691c\u7d22\u7d50\u679c\", \"\u8a72\u5f53\u3059\u308b\u7d50\u679c\u306f\u3042\u308a\u307e\u305b\u3093\u3067\u3057\u305f\u3002\")\n    else:\n        for url, title, description in results:\n            results_box.insert(\"\", \"end\", values=(title, url, description&#91;:100]))\n\n# GUI\u8a2d\u8a08\ndef create_gui():\n    root = tk.Tk()\n    root.title(\"\u30af\u30ed\u30fc\u30e9\u30fc\u578b\u691c\u7d22\u30a8\u30f3\u30b8\u30f3\")\n\n    # \u30d5\u30ec\u30fc\u30e0\u69cb\u6210\n    frame_crawl = ttk.Frame(root, padding=10)\n    frame_crawl.grid(row=0, column=0, sticky=\"ew\")\n    frame_search = ttk.Frame(root, padding=10)\n    frame_search.grid(row=1, column=0, sticky=\"nsew\")\n\n    # \u30af\u30ed\u30fc\u30ea\u30f3\u30b0\u30bb\u30af\u30b7\u30e7\u30f3\n    ttk.Label(frame_crawl, text=\"\u30b9\u30bf\u30fc\u30c8URL:\").grid(row=0, column=0, padx=5, pady=5, sticky=\"w\")\n    url_entry = ttk.Entry(frame_crawl, width=50)\n    url_entry.grid(row=0, column=1, padx=5, pady=5, sticky=\"w\")\n\n    status_label = ttk.Label(frame_crawl, text=\"Status: Ready\", foreground=\"blue\")\n    status_label.grid(row=1, column=0, columnspan=2, padx=5, pady=5, sticky=\"w\")\n\n    def on_crawl():\n        start_url = url_entry.get().strip()\n        if not start_url:\n            messagebox.showerror(\"Error\", \"\u30b9\u30bf\u30fc\u30c8URL\u3092\u5165\u529b\u3057\u3066\u304f\u3060\u3055\u3044\u3002\")\n            return\n\n        # \u30b9\u30ad\u30fc\u30e0\u88dc\u5b8c\n        if not start_url.startswith((\"http:\/\/\", \"https:\/\/\")):\n            start_url = \"https:\/\/\" + start_url\n\n        # \u30c9\u30e1\u30a4\u30f3\u3092\u53d6\u5f97\n        domain = urlparse(start_url).netloc\n        status_label.config(text=\"Crawling in Progress...\")\n        threading.Thread(target=start_crawling, args=(start_url, domain, status_label)).start()\n\n    ttk.Button(frame_crawl, text=\"\u30af\u30ed\u30fc\u30ea\u30f3\u30b0\u958b\u59cb\", command=on_crawl).grid(row=0, column=2, padx=5, pady=5)\n\n    # \u691c\u7d22\u30bb\u30af\u30b7\u30e7\u30f3\n    ttk.Label(frame_search, text=\"\u691c\u7d22\u30af\u30a8\u30ea:\").grid(row=0, column=0, padx=5, pady=5, sticky=\"w\")\n    query_entry = ttk.Entry(frame_search, width=30)\n    query_entry.grid(row=0, column=1, padx=5, pady=5, sticky=\"w\")\n\n    results_box = ttk.Treeview(frame_search, columns=(\"Title\", \"URL\", \"Description\"), show=\"headings\")\n    results_box.heading(\"Title\", text=\"\u30bf\u30a4\u30c8\u30eb\")\n    results_box.heading(\"URL\", text=\"URL\")\n    results_box.heading(\"Description\", text=\"\u8aac\u660e\")\n    results_box.grid(row=1, column=0, columnspan=3, padx=5, pady=5, sticky=\"nsew\")\n\n    def on_search():\n        query = query_entry.get().strip()\n        if not query:\n            messagebox.showerror(\"Error\", \"\u691c\u7d22\u30af\u30a8\u30ea\u3092\u5165\u529b\u3057\u3066\u304f\u3060\u3055\u3044\u3002\")\n            return\n        search(query, results_box)\n\n    ttk.Button(frame_search, text=\"\u691c\u7d22\", command=on_search).grid(row=0, column=2, padx=5, pady=5)\n\n    # \u30a6\u30a3\u30f3\u30c9\u30a6\u30b5\u30a4\u30ba\u8abf\u6574\n    root.columnconfigure(0, weight=1)\n    frame_search.rowconfigure(1, weight=1)\n\n    root.mainloop()\n\n# GUI\u8d77\u52d5\nif __name__ == \"__main__\":\n    create_gui()\n\n    # \u30c7\u30fc\u30bf\u30d9\u30fc\u30b9\u63a5\u7d9a\u3092\u9589\u3058\u308b\n    conn.close()<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"_uf_show_specific_survey":0,"_uf_disable_surveys":false,"footnotes":""},"categories":[4,65],"tags":[3,39],"class_list":["post-25689","post","type-post","status-publish","format-standard","hentry","category-programming","category-python","tag-programming","tag-python"],"aioseo_notices":[],"jetpack_featured_media_url":"","_links":{"self":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts\/25689","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=25689"}],"version-history":[{"count":3,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts\/25689\/revisions"}],"predecessor-version":[{"id":25776,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=\/wp\/v2\/posts\/25689\/revisions\/25776"}],"wp:attachment":[{"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=25689"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=25689"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.tyosuke20xx.com\/blog\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=25689"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}