Scrapy: crawl links, then crawl each link's content


The following code was tested and works under Python 3.6.
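The files below live in a standard Scrapy project layout. Judging by the import paths (storage.items, storage.pipelines), the project was presumably created with scrapy startproject storage, and the spider saved under storage/spiders/ (any file name works; the spider is identified by its name attribute, not the file name).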

#!/usr/bin/python
# -*- coding:utf-8 -*-
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector
from storage.items import StorageItem


class StorageSpider(Spider):
    """
    Three members must be defined: name, start_urls, parse()
    """
    name = "storage"  # identifier of this spider
    allowed_domains = ["www.zyc56.org.cn"]  # domain restriction
    start_urls = [
        # a list of URLs the spider starts crawling from
        "http://www.zyc56.org.cn/index.php?m=content&c=index&a=lists&catid=31"
    ]

    def parse(self, response):
        sel = Selector(response)
        item = StorageItem()

        mainXpath = sel.xpath('//div[@class="map_intro clear"]')
        elseXpath = sel.xpath('//div[@class="map_article"]')

        item['crawlUrl'] = response.url
        item['enterpriseName'] = mainXpath.xpath('dl/dd[1]/text()').extract()  # company name
        item['contactUser'] = mainXpath.xpath('dl/dd[2]/text()').extract()  # contact person
        item['contactNumber'] = mainXpath.xpath('dl/dd[3]/b/text()').extract()  # contact phone number
        item['warehouseType'] = mainXpath.xpath('dl/dd[4]/text()').extract()  # warehouse type
        item['releaseTime'] = mainXpath.xpath('dl/dt/span/text()').extract()  # release time

        item['warehouseAddress'] = elseXpath.xpath('div/span/text()').extract()  # region
        item['warehouseDetailAddr'] = elseXpath.xpath('div/text()[2]').extract()  # detailed address

        # locate the cell following the "仓库规模" (warehouse size) label;
        # fall back to the "仓库建设方案" table layout if it is missing
        sonPath = elseXpath.xpath('table/tbody/tr/td[contains(text(),"仓库规模")]/following-sibling::td[position()=1]')
        if not len(sonPath):  # empty selector list
            sonPath = elseXpath.xpath('table/tbody/tr/td[contains(text(),"仓库建设方案")]/../following-sibling::tr/td[position()=2]')

        # strip non-breaking spaces and the "平米" (square meters) unit, then collapse whitespace
        item['warehouseSize'] = sonPath.xpath('normalize-space(translate(translate(string(.),"\xa0",""),"平米",""))').extract()

        # only detail pages fill these fields; the list page yields no item
        if len(item['enterpriseName']):
            yield item

        # follow every detail-page link and parse it with this same method
        alinkList = sel.xpath('//dd[@class="intro"]/a/@href').extract()
        for alink in alinkList:
            yield Request(url=alink, callback=self.parse)
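In this spider a single parse() callback serves both the list page and the detail pages: on the list page the field XPaths match nothing, so the emptiness check suppresses the item and only the link-following loop does work. A common alternative (a sketch reusing the same XPaths, not part of the original post) splits the two roles into separate callbacks:

    def parse(self, response):
        # list page: extract detail-page links and hand them to parse_detail
        for alink in response.xpath('//dd[@class="intro"]/a/@href').extract():
            yield Request(url=alink, callback=self.parse_detail)

    def parse_detail(self, response):
        # detail page: extract the item fields only
        item = StorageItem()
        item['crawlUrl'] = response.url
        # ... same field XPaths as in the version above ...
        yield item

Scrapy's default duplicate filter drops requests for URLs it has already seen, so revisiting the same link list is harmless either way.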

The pipelines.py file is as follows:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#from scrapy.exporters import JsonItemExporter
import pymysql


class StoragePipeline(object):

    # def open_spider(self, spider):
    #     # optional: called when the spider is opened
    #     # export to the w3school_data_utf8.json file
    #     self.file = open('w3school_data_utf8.json', 'wb')
    #     self.exporter = JsonItemExporter(self.file, encoding='utf-8')
    #     self.exporter.start_exporting()
    #
    # def close_spider(self, spider):
    #     # optional: called when the spider is closed
    #     self.exporter.finish_exporting()
    #     self.file.close()
    #
    # def process_item(self, item, spider):
    #     self.exporter.export_item(item)
    #     return item

    def __init__(self):
        self.dbpool = pymysql.connect(
            host='127.0.0.1',
            db='db_scrapy',
            user='root',
            passwd='abc123',
            charset='utf8'
        )

    def process_item(self, item, spider):
        db = self.dbpool
        cur = db.cursor()
        try:
            cur.execute(
                "insert into storage_info(enterprise_name, warehouse_address,"
                " warehouse_detail_addr, warehouse_size, warehouse_type,"
                " contact_user, contact_number, release_time, add_type, crawl_url)"
                " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (
                    item['enterpriseName'][0][5:],  # [5:] strips the Chinese label prefix scraped with the value
                    item['warehouseAddress'][0],
                    item['warehouseDetailAddr'][0].strip()[5:],
                    item['warehouseSize'][0].strip(),
                    item['warehouseType'][0][5:],
                    item['contactUser'][0][4:],
                    item['contactNumber'][0],
                    item['releaseTime'][0][3:],
                    1,  # add_type (hardcoded)
                    item['crawlUrl']
                )
            )
            db.commit()
        except Exception as e:
            print('Error:', e)
            db.rollback()
        return item

    def close_spider(self, spider):
        # close the shared MySQL connection once, when the spider finishes
        self.dbpool.close()
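The INSERT above assumes a storage_info table already exists in the db_scrapy database. The original post does not include its schema; the one-off setup script below is a guess, with column names taken from the INSERT statement and column types that are pure assumptions:

    # One-off setup: create the storage_info table the pipeline writes to.
    # Column names come from the INSERT statement; the types are guesses.
    import pymysql

    DDL = """
    create table if not exists storage_info (
        id int auto_increment primary key,
        enterprise_name varchar(255),
        warehouse_address varchar(255),
        warehouse_detail_addr varchar(255),
        warehouse_size varchar(64),
        warehouse_type varchar(64),
        contact_user varchar(64),
        contact_number varchar(64),
        release_time varchar(64),
        add_type int,
        crawl_url varchar(512)
    ) default charset = utf8
    """

    conn = pymysql.connect(host='127.0.0.1', db='db_scrapy',
                           user='root', passwd='abc123', charset='utf8')
    try:
        with conn.cursor() as cur:
            cur.execute(DDL)
        conn.commit()
    finally:
        conn.close()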

The items.py file is as follows:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class StorageItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    enterpriseName = scrapy.Field()
    warehouseAddress = scrapy.Field()
    warehouseDetailAddr = scrapy.Field()
    warehouseSize = scrapy.Field()
    warehouseType = scrapy.Field()
    releaseTime = scrapy.Field()
    contactUser = scrapy.Field()
    contactNumber = scrapy.Field()
    addType = scrapy.Field()
    crawlUrl = scrapy.Field()

The settings.py file needs the following configuration change:

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'storage.pipelines.StoragePipeline': 300,
}
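With the pipeline registered, start the crawl from the project root with Scrapy's command-line tool; the argument is the spider's name attribute ("storage"), not the file name:

    scrapy crawl storage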
