python 百度搜索页抽取
生活随笔
收集整理的這篇文章主要介紹了 python 百度搜索页抽取,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
python 百度搜索頁抽取
"""Collect Baidu search-result titles and links into an Excel workbook.

Original file name: 百度搜索.py
"""
import os
import re
import time
from urllib.parse import urlencode, urljoin

import openpyxl
import requests
from bs4 import BeautifulSoup


class BaiDuSearch(object):
    """Fetch search-result pages, extract title/link pairs, save them to Excel.

    Attributes:
        session: ``requests`` session reused for every request.
        excel: the openpyxl workbook, set by ``initialize_excel``.
        headers: HTTP headers sent with every request.
        count: maximum number of "next page" hops to follow (default 3).
        time: pause, in seconds, before fetching each following page.
        timeout: per-request timeout in seconds.
    """

    # Captions of the pager link that leads to the next result page, compared
    # after all whitespace has been removed. The original code matched only
    # the traditional-character form; the simplified form is accepted too.
    # NOTE(review): presumably the live page uses the simplified caption -
    # confirm against the real markup.
    NEXT_PAGE_LABELS = ("下一页>", "下一頁>")

    def __init__(self):
        self.session = requests.session()
        self.excel = None
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/67.0.3396.10 Safari/537.36",
        }
        self.count = 3  # page depth to follow; 3 pages by default
        self.time = 2  # delay in seconds before requesting the next page
        self.timeout = 10  # without a timeout a stalled request blocks forever

    def get_status(self, url):
        """GET *url* and return the response, or ``None`` on any failure.

        A failure is either a transport error raised by ``requests`` or a
        status code other than 200; both print the same message.
        """
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.timeout
            )
        except requests.RequestException:
            print("網絡連接失敗!")
            return None
        if response.status_code == 200:
            return response
        print("網絡連接失敗!")
        return None

    def index(self, url):
        """Visit *url* once with the shared session; return True on success."""
        return self.get_status(url) is not None

    def search(self, url):
        """Return the body text of *url*, or ``None`` if the request failed."""
        response = self.get_status(url)
        if response is None:
            # Previously this fell through to ``None.text`` and raised
            # AttributeError.
            return None
        return response.text

    @staticmethod
    def parse(html):
        """Extract ``{title: href}`` from one result page.

        Only ``#content_left > .result.c-container`` entries that contain an
        ``h3 > a`` link are used. Entries with the same title overwrite each
        other because the title is the dictionary key.
        """
        title_url = {}
        soup = BeautifulSoup(html, "html5lib")
        for result in soup.select("#content_left > .result.c-container"):
            links = result.select("h3 > a")
            if not links:
                continue
            first = links[0]
            title_url[first.text] = first.get("href")
        return title_url

    def initialize_excel(self, path):
        """Open the workbook at *path*, or create one and write a header row.

        The header row ("title", "url") is written, and the file saved, only
        when the workbook did not exist yet.
        """
        if os.path.exists(path):
            self.excel = openpyxl.load_workbook(path)
        else:
            self.excel = openpyxl.Workbook()
            headline_data = {
                "title": "url",
            }
            self.write_to_excel(path, headline_data)

    def write_to_excel(self, path, title_url):
        """Append every (title, url) pair to the first sheet and save to *path*.

        Each pair is also printed. ``initialize_excel`` must have been called
        first so that ``self.excel`` holds a workbook.
        """
        sheet = self.excel[self.excel.sheetnames[0]]  # always the first sheet
        for title, link in title_url.items():
            print(title, link)
            sheet.append((title, link))
        self.excel.save(path)

    def page_parse(self, url, html):
        """Return the URL of the page after *url*, found in *html*.

        Looks through the ``#page > a`` links for one captioned as "next
        page" and resolves its ``href`` against *url*. When there is no such
        link, *url* is returned unchanged, so callers can detect the last
        page by comparing the result with their input.
        """
        soup = BeautifulSoup(html, "html5lib")
        for link in soup.select("#page > a"):
            caption = re.sub(r"\s+", "", link.text)
            if caption not in self.NEXT_PAGE_LABELS:
                continue
            href = link.get("href")
            if href:
                # urljoin handles relative and absolute hrefs and any scheme;
                # the former regex-based rebuild raised IndexError on
                # non-https URLs.
                return urljoin(url, href)
        return url

    def _iter_pages(self, url, html):
        """Yield ``(url, html)`` for up to ``self.count`` following pages.

        Starts from the page described by *url* / *html* (which is not
        yielded itself) and stops early when there is no next link or a page
        cannot be downloaded.
        """
        for _ in range(self.count):
            next_url = self.page_parse(url, html)
            if next_url == url:
                # No "next page" link: stop rather than re-fetch the same page.
                return
            url = next_url
            print("獲取下一頁:", url)
            time.sleep(self.time)
            html = self.search(url)
            if html is None:
                return
            yield url, html

    def get_page(self, url, html):
        """Yield the URLs of the pages that follow *url*, one per hop.

        Each URL is yielded after that page has been downloaded, since its
        markup is needed to find the link to the page after it.
        """
        for page_url, _ in self._iter_pages(url, html):
            yield page_url

    def next_page(self, url, html, path):
        """Parse every following page and append its results to the workbook.

        NOTE(review): *html* (the page at *url*) is only used to locate the
        next link; its own results are never parsed or written, as in the
        original code. Confirm whether the first page should be saved too.
        """
        # Reuse the markup fetched while paging rather than downloading each
        # page a second time.
        for _, page_html in self._iter_pages(url, html):
            title_url = self.parse(page_html)
            self.write_to_excel(path, title_url)

    def main(self):
        """Run one search for the fixed keyword and save the results."""
        # Visit the home page first with the shared session (best effort; the
        # result is not checked).
        url = "https://www.baidu.com"
        self.index(url)

        # Prepare the workbook in the current working directory.
        path = os.path.abspath(os.path.join(os.getcwd(), "百度搜索信息.xlsx"))
        self.initialize_excel(path)
        try:
            url = "https://www.baidu.com/s?"
            data = {
                'wd': '代理',
            }
            # Build the search URL.
            url = url + urlencode(data)
            html = self.search(url)
            if html is not None:
                self.next_page(url, html, path)
        finally:
            # Close the workbook even if a page fails half-way through.
            self.excel.close()


if __name__ == '__main__':
    bd = BaiDuSearch()
    bd.main()
總結
以上是生活随笔為你收集整理的python 百度搜索页抽取的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: java的rest异步调用_使用Asyn
- 下一篇: 深圳达内培训python学费