当前位置：首页 > news >正文

基于selenium的网页自动搜索

news 2025/10/10 22:00:06

第一节

通过打开百度首页并完成一次搜索，学习selenium库的基本功能。

 1 from selenium import webdriver
 2 from selenium.webdriver.chrome.service import Service
 3 from selenium.webdriver.chrome.options import Options
 4 from selenium.webdriver.common.by import By
 5 from selenium.webdriver.support.ui import WebDriverWait
 6 from selenium.webdriver.support import expected_conditions as EC
 7 import time
 8 
 9 from selenium.webdriver.common.keys import Keys
10 
# Location of the ChromeDriver binary. Replace it with your own path; the
# driver can also be dropped next to this script and referenced as
# './chromedriver.exe'.
chrome_driver_path = 'E:/Git_repo/VSCode_tmp/autopython/chromedriver-win64/chromedriver.exe'

# The Service object wraps the driver executable.
service = Service(chrome_driver_path)

# Browser launch configuration. Other switches that can be enabled on demand:
#   --disable-gpu, --disable-extensions, --disable-popup-blocking,
#   --incognito, --no-sandbox, --disable-dev-shm-usage,
#   --remote-debugging-port=9222
options = Options()
for flag in (
    "--start-maximized",  # open the window maximized
    "--disable-blink-features=AutomationControlled",  # hide the automation hint
    "--disable-infobars",  # hide the info bar
):
    options.add_argument(flag)
for name, value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    options.add_experimental_option(name, value)

# Start Chrome through the configured service.
driver = webdriver.Chrome(service=service, options=options)
# Optional anti-bot tweak (disabled): mask the navigator.webdriver property.
# driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => false});")

try:
    # Open the Baidu home page.
    url = "https://www.baidu.com"
    driver.get(url)

    # Explicit wait (up to 10 s) shared by both lookups below.
    wait = WebDriverWait(driver, 10)

    # The input box is located by XPath here; By.ID and By.CLASS_NAME are
    # alternative strategies worth trying.
    search_box_locator = (By.XPATH, "//*[@id='chat-textarea']")
    search_box = wait.until(EC.presence_of_element_located(search_box_locator))

    # Type the keyword and press Enter in one call to submit the search
    # (search_box.submit() is the alternative).
    search_box.send_keys("原神"+ Keys.RETURN)

    # Block until the result column is present in the DOM.
    results_locator = (By.XPATH, "//div[@id='content_left']")
    wait.until(EC.presence_of_element_located(results_locator))

    # Print the title of the result page, then keep it visible for 5 s.
    print(driver.title)
    time.sleep(5)
finally:
    # Always shut the browser down, even when a step above failed.
    driver.quit()

代码主要分为四大块：第一块导入selenium驱动网页所需要的包，并配置驱动chrome浏览器的chrome_driver路径；第二块初始化ChromeDriver Service和Options，主要用于控制如何打开浏览器；第三块初始化WebDriver，并传入ChromeDriver Service；第四块为具体打开网页和处理网页元素的方法，这里使用了XPATH方式寻找网页元素。

第二节

要想实现网页自动化登录，最简单的方法是使用cookie登录：先保存一次登录后网页的cookie，之后加载该cookie即可实现自动化登录，同时还可以避免自动化工具打开的浏览器无法保存登录信息的问题。

 1 from selenium import webdriver
 2 from selenium.webdriver.chrome.service import Service
 3 from selenium.webdriver.chrome.options import Options
 4 import os
 5 import json
 6 import time
 7 
 8 # 配置 ChromeDriver 路径，替换为你的 ChromeDriver 路径，你也可以将chromedriver拖入文件根目录，使用'./chromedriver.exe'路径。
 9 chrome_driver_path = 'E:/Git_repo/VSCode_tmp/autopython/chromedriver-win64/chromedriver.exe'  # 替换为你的 ChromeDriver 路径
10 
11 # 初始化 ChromeDriver Service
12 service = Service(chrome_driver_path)
13 options = Options()
14 options.add_argument("--disable-blink-features=AutomationControlled")  # 使浏览器不显示自动化控制的信息
15 
16 # 初始化 WebDriver，并传入 ChromeDriver Service
17 driver = webdriver.Chrome(service=service, options=options)
18 
19 try:
20     # 打开小红书首页
21     url = "https://www.xiaohongshu.com"
22     driver.get(url)
23     time.sleep(40)  # 等待页面加载完成
24 
25     # 获取 cookies
26     cookies = driver.get_cookies()
27 
28     # 打印当前工作目录
29     print("Current working directory: ", os.getcwd())
30 
31 
32     # 将 cookies 保存到文件
33     with open("cookies.json", "w") as f:
34         json.dump(cookies, f)
35     print("Cookies have been saved to cookies.json")
36 finally:
37     # 关闭 WebDriver
38     driver.quit()

上面的程序打开网页后会等待40秒，需要在这段时间内手动完成登录；随后程序把当前网页的cookie保存到cookies.json，用于之后的自动化登录。

第三节

通过上面保存的cookie，编写程序实现网页自动登录，并搜索指定内容。

  1 from selenium import webdriver
  2 from selenium.webdriver.chrome.service import Service
  3 from selenium.webdriver.chrome.options import Options
  4 from selenium.webdriver.common.by import By
  5 from selenium.webdriver.support.ui import WebDriverWait
  6 from selenium.webdriver.support import expected_conditions as EC
  7 
  8 from selenium.common.exceptions import NoSuchElementException
  9 import random
 10 import os
 11 import time
 12 import json
 13 
 14 
 15 from selenium.webdriver.common.keys import Keys
 16 
 17 # 配置 ChromeDriver 路径，替换为你的 ChromeDriver 路径，你也可以将chromedriver拖入文件根目录，使用'./chromedriver.exe'路径。
 18 chrome_driver_path = 'E:/Git_repo/VSCode_tmp/autopython/chromedriver-win64/chromedriver.exe'  # 替换为你的 ChromeDriver 路径
 19 
 20 # 初始化 ChromeDriver Service
 21 service = Service(chrome_driver_path)
 22 # 打开浏览器时的相关配置，可以根据需求进行打开和关闭
 23 options = Options()
 24 options.add_argument("--start-maximized")  # 启动时最大化窗口
 25 options.add_argument("--disable-blink-features=AutomationControlled")  # 使浏览器不显示自动化控制的信息
 26 # options.add_argument("--disable-gpu")  # 禁用GPU硬件加速
 27 options.add_argument("--disable-infobars")  # 隐藏信息栏
 28 # options.add_argument("--disable-extensions")  # 禁用所有扩展程序
 29 # options.add_argument("--disable-popup-blocking")  # 禁用弹出窗口拦截
 30 # options.add_argument("--incognito")  # 启动无痕模式
 31 # options.add_argument("--no-sandbox")  # 关闭沙盒模式（提高性能）
 32 # options.add_argument("--disable-dev-shm-usage")  # 使用/dev/shm分区以避免共享内存问题
 33 # options.add_argument("--remote-debugging-port=9222")  # 启用远程调试端口
 34 options.add_experimental_option("excludeSwitches", ["enable-automation"])
 35 options.add_experimental_option("useAutomationExtension", False)
 36 
 37 
 38 MAX_ITEMS_BEFORE_WRITING = 5  # 每收集 5 条数据就写入一次文件
 39 
 40 def write_to_file(collected_items):
 41     print("当前工作目录:", os.getcwd())
 42     # 追加写入文件的逻辑
 43     with open('result.json', 'a', encoding='utf-8') as file:
 44         json_data = [json.dumps(item, ensure_ascii=False) for item in collected_items]
 45         file.write('\n'.join(json_data) + '\n')
 46     print("Results saved successfully.")
 47 
 48 #随机延时函数，用来模拟动作比较快的点击操作
 49 def random_delay(time_start, time_end):
 50     delay = random.uniform(time_start, time_end)
 51     time.sleep(delay)
 52 
 53 def scroll_and_collect(driver, num_items):
 54     collected_items = []
 55     collected_count = 0
 56     result_count = 0
 57     while result_count < num_items:
 58         # 获取了页面上的所有选项卡
 59         items = driver.find_elements(By.XPATH, "//section[@class='note-item']")
 60         # 遍历获取的列表，分析里面的元素
 61         for item in items:
 62             try:
 63                 # 找到元素里封面、标题、作者昵称、作者头像等元素
 64                 cover = item.find_element(By.XPATH, ".//a[@class='cover mask ld']/img").get_attribute("src")
 65                 title = item.find_element(By.XPATH, ".//a[@class='title']//span").text
 66                 author_avatar = item.find_element(By.XPATH, ".//a[@class='author']//img[@class='author-avatar']").get_attribute("src")
 67                 author_name = item.find_element(By.XPATH, ".//a[@class='author']//span").text
 68 
 69                 # 存储获取的结果
 70                 collected_items.append({
 71                     "cover": cover,
 72                     "title": title,
 73                     "author_avatar": author_avatar,
 74                     "author_name": author_name
 75                 })
 76                 result_count += 1
 77                 collected_count += 1
 78                 # 写入文件
 79                 if collected_count >= MAX_ITEMS_BEFORE_WRITING:
 80                     write_to_file(collected_items)
 81                     collected_items = []  # 清空已收集的项
 82                     collected_count = 0  # 重置计数器
 83             except NoSuchElementException:
 84                 continue
 85         # 翻页
 86         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
 87         random_delay(4, 8)  # 等待页面加载新的内容
 88 
 89         # 等待新内容加载的逻辑
 90         try:
 91             WebDriverWait(driver, 10).until(
 92                 EC.presence_of_element_located((By.XPATH, "//section[@class='note-item']"))
 93             )
 94         except NoSuchElementException:
 95             break
 96         # 最后一次写入剩余的项
 97     if collected_count > 0:
 98         write_to_file(collected_items)
 99     return collected_items
100 
# Start Chrome through the configured service.
driver = webdriver.Chrome(service=service, options=options)
# Optional anti-bot tweak (disabled): mask the navigator.webdriver property.
# driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => false});")

try:
    # Open the Xiaohongshu home page.
    url = "https://www.xiaohongshu.com"
    driver.get(url)

    # Explicit wait (up to 5 s) for the login panel instead of a fixed sleep.
    wait = WebDriverWait(driver, 5)
    # NOTE: absolute XPath copied from DevTools; the original author flags it
    # as needing re-verification.
    login_panel_locator = (By.XPATH, "//*[@id='app']/div[1]/div/div[1]/div[2]")
    try:
        search_box = wait.until(EC.presence_of_element_located(login_panel_locator))
        print("成功等待登录界面")
    except Exception as e:
        print(f"等待登录界面超时,错误信息：{e}")

    # Load the cookies saved by the earlier login script and install each
    # one into the current browser session.
    with open("cookies.json", "r") as f:
        cookies = json.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

    # Reload so the page is requested again with the cookies attached.
    driver.refresh()

    # Treat the login button disappearing within 3 s as a successful login
    # (checking the cookies' validity would be an alternative).
    wait = WebDriverWait(driver, 3)
    login_button_locator = (By.XPATH, '//*[@id="login-btn"]')
    try:
        wait.until(EC.invisibility_of_element_located(login_button_locator))
        print("成功登录小红书")
    except Exception as e:
        print(f"登录小红书失败,错误信息：{e}")

    time.sleep(5)

    # Explicit wait (up to 10 s) shared by the search steps below.
    wait = WebDriverWait(driver, 10)
    note_locator = (By.XPATH, "//section[@class='note-item']")

    # Type the keyword into the search box.
    search_box = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@placeholder='搜索小红书']")))
    search_box.send_keys("漫展")
    time.sleep(5)

    # Click the search button once it is clickable.
    search_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='input-button']")))
    search_button.click()
    time.sleep(5)

    # Wait for the first note card of the result list.
    wait.until(EC.presence_of_element_located(note_locator))
    time.sleep(5)

    # Scroll through the results and collect the requested number of cards.
    num_items = 100
    scroll_and_collect(driver, num_items)

    # For a quick single-pass check (no scrolling), run the card-parsing loop
    # of scroll_and_collect once over
    # driver.find_elements(By.XPATH, "//section[@class='note-item']").

    # Keep the browser open for a minute so the results can be inspected.
    time.sleep(60)
finally:
    # Always shut the browser down, even when a step above failed.
    driver.quit()