requests 分类多级页面_scrapy框架爬取多级页面
spiders.py
# -*- coding: utf-8 -*-
import scrapy
from weather.items import WeatherItem
from scrapy.crawler import CrawlerProcess
import re
'''
多級分類爬取
'''
class IgxSpider(scrapy.Spider):
    """Scrape product listings from igxpt.com category pages.

    For every product entry on a category page the spider yields one
    ``WeatherItem`` (name, image URL, price). It then schedules a request
    for each page of the category's pagination; those responses are handled
    by the same ``parse`` callback.
    """

    name = 'igx_result'
    allowed_domains = ['www.igxpt.com']
    # start_urls = ['http://www.igxpt.com/cate/192/']

    # Pagination label of the form "共N頁" ("N pages in total"). The
    # simplified character is accepted as well because the page text may
    # use either form.
    _PAGE_COUNT_RE = re.compile(r'共(\d+)[頁页]')

    def start_requests(self):
        """Yield one request per category page (category ids 192 and 193).

        The ids are hard-coded for brevity; they could also be collected
        from the home page, at the cost of one more crawl level.
        """
        for cate_id in range(192, 194):
            yield scrapy.Request(
                url='http://www.igxpt.com/cate/{}/'.format(cate_id)
            )

    def parse(self, response):
        """Yield the items found on ``response``, then the pagination requests.

        Missing page elements no longer abort the callback: an entry with a
        missing image or price gets ``None`` for that field, and a page
        without a recognisable pagination block yields no follow-up
        requests.
        """
        entries = response.xpath(
            '//ul[@class="shop-list-recommend mt20 clearfix"]/li'
        )
        for entry in entries:
            yield self._build_item(response, entry)

        # Every page links to all pages of its category again.
        # NOTE(review): this presumably relies on Scrapy's duplicate filter
        # to drop repeated requests, and page 1 may be fetched twice (once
        # via the category URL, once via the page parameter) -- confirm.
        for page_url in self._page_urls(response):
            yield scrapy.Request(url=page_url, callback=self.parse)

    @staticmethod
    def _build_item(response, entry):
        """Build a ``WeatherItem`` from one ``<li>`` selector of the listing."""
        item = WeatherItem()
        item['name'] = entry.xpath('./a/p[1]/text()').extract_first()

        img_src = entry.xpath('./a/div/img/@src').extract_first()
        # urljoin resolves the src against the page URL instead of gluing
        # the host in front of it by hand.
        item['url'] = response.urljoin(img_src) if img_src else None

        price = entry.xpath(
            './a/p[2]/span[@class="blue"]/text()'
        ).extract_first()
        item['price'] = price + "元" if price is not None else None
        return item

    def _page_urls(self, response):
        """Return the URLs of all pagination pages, or ``[]`` if none found."""
        label = response.xpath(
            '//div[@class="dataTables_paginate paging_simple_numbers"]'
            '/span/text()'
        ).extract_first()
        match = self._PAGE_COUNT_RE.search(label or '')
        first_link = response.xpath(
            '//ul[@class="pagination"]/li/a/@href'
        ).extract_first()
        if match is None or not first_link:
            self.logger.debug('No pagination found on %s', response.url)
            return []

        # Keep everything up to the first "=" of the link and append the
        # page number after it.
        prefix = response.urljoin(first_link.split('=')[0] + '=')
        page_count = int(match.group(1))
        return [prefix + str(page) for page in range(1, page_count + 1)]
items.py
import scrapy
class WeatherItem(scrapy.Item):
    """Item with the three fields stored for each scraped product."""

    # Text of the product's first <p> element, as set by the spider.
    name = scrapy.Field()
    # Image URL of the product, as set by the spider.
    url = scrapy.Field()
    # Price text with the currency suffix appended by the spider.
    price = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import logging
import urllib.request

import pymysql
class WeatherPipeline(object):
    """Item pipeline that inserts every item into ``goods_info_detail``.

    One MySQL connection is opened when the spider starts and closed when it
    finishes, instead of connecting and disconnecting for every item. If
    ``process_item`` is called before ``open_spider``, the connection is
    created lazily.
    """

    _log = logging.getLogger(__name__)
    _INSERT_SQL = (
        "INSERT INTO `goods_info_detail` (name, url, price) "
        "VALUES (%s, %s, %s)"
    )

    # Shared by all calls on this instance; None until a connection exists.
    connection = None

    @staticmethod
    def _connect():
        """Open a connection to the local ``scrapy`` database."""
        return pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='root',
            db='scrapy',
            # MySQL's charset name is "utf8mb4", not "utf-8"; setting it
            # explicitly keeps non-ASCII text intact regardless of the
            # driver's default.
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
        )

    def open_spider(self, spider):
        """Open the database connection when the spider starts."""
        if self.connection is None:
            self.connection = self._connect()

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        if self.connection is not None:
            self.connection.close()
            self.connection = None

    def process_item(self, item, spider):
        """Insert ``item`` (name, url, price) and return it unchanged.

        A ``ValueError`` is logged and the item is still returned. A MySQL
        error rolls the transaction back and is re-raised to the caller.
        """
        if self.connection is None:
            self.connection = self._connect()

        row = (item['name'], item['url'], item['price'])
        try:
            # Re-establish the connection if the server dropped it.
            self.connection.ping(reconnect=True)
            with self.connection.cursor() as cursor:
                cursor.execute(self._INSERT_SQL, row)
            self.connection.commit()
        except ValueError as exc:
            self._log.error('Could not store item %r: %s', row, exc)
        except pymysql.MySQLError:
            # Do not leave a half-finished transaction on the shared
            # connection.
            self.connection.rollback()
            raise
        return item
settings.py
# Scrapy settings for the "weather" project.
BOT_NAME = 'weather'

# Where spiders are looked up and where new ones are created.
SPIDER_MODULES = ['weather.spiders']
NEWSPIDER_MODULE = 'weather.spiders'

# Minimum severity of log messages.
LOG_LEVEL = 'WARNING'

# Item pipelines, keyed by import path; the value is the pipeline's order.
ITEM_PIPELINES = {'weather.pipelines.WeatherPipeline': 300}
總結
以上是生活随笔為你收集整理的requests 分类多级页面_scrapy框架爬取多级页面的全部內容,希望文章能夠幫你解決所遇到的問題。
 
                            
                        - 上一篇: 紧急救命3剧情介绍
- 下一篇: 摩尔庄园手游载具怎么更换
