scrapy框架爬取知乎信息实例详解(详细)

289次阅读
没有评论
scrapy框架爬取知乎信息实例详解(详细)

爬取知乎的关注信息作为我们scrapy框架的详解例子,爬取的知乎大v是轮子哥,然后将爬取的信息存储进mongo数据库。我将所有解释都放进例子里。虽然每一句代码都有解释,但是最好还是有爬虫的基础。
整体思路:
1.选定一位有较多关注数的知乎达人作为我们的爬取对象
2.通过知乎接口获得该用户的关注列表和粉丝列表
3.通过递归的方法实现对列表中每一个用户的爬取,爬取他们的粉丝列表和关注列表。
4.通过知乎接口获得列表中的每位用户的详细信息。

zhihu.py

# -*- coding: utf-8 -*-
# If the server answers with HTTP 500, it has detected that the request does
# not come from a browser; add a User-Agent to the request headers in the
# project settings file.
import json

import scrapy
from scrapy import Request

from zhihuuser.items import UserItem


class ZhihuSpider(scrapy.Spider):
    """Crawl Zhihu user profiles by walking follow relationships.

    Starting from ``start_user``, the spider requests the user's profile,
    the list of people the user follows ("followees") and the list of the
    user's followers. Every user found in either list is requested in turn,
    so the crawl spreads recursively through the follow graph.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    # Not read anywhere in this class: start_requests() builds the initial
    # requests itself.
    start_url = ['http://www.zhihu.com/']
    # url_token of the account the crawl starts from.
    start_user = 'excited-vczh'

    # Profile endpoint of a single user; ``include`` selects extra fields.
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'

    # "follows": the accounts this user is following.
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    # "followers": the accounts following this user.
    # ``limit`` is the page size; ``offset`` advances from page to page.
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        """Yield the initial profile, followee-list and follower-list requests."""
        # The URL templates are filled with format() so the same template
        # serves every user; each request names the callback that parses it.
        yield Request(
            self.user_url.format(user=self.start_user, include=self.user_query),
            callback=self.parse_user,
        )
        for request in self._first_page_requests(self.start_user):
            yield request

    def parse_user(self, response):
        """Yield a ``UserItem`` for one profile, then requests for its lists.

        Only keys of the JSON response that are declared as fields on
        ``UserItem`` are copied into the item.
        """
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            if field in result:
                item[field] = result.get(field)
        yield item

        url_token = result.get('url_token')
        if url_token:
            for request in self._first_page_requests(url_token):
                yield request

    def parse_follows(self, response):
        """Parse one page of a followee list; paginate with this callback."""
        for request in self._parse_user_list(response, self.parse_follows):
            yield request

    def parse_followers(self, response):
        """Parse one page of a follower list; paginate with this callback."""
        for request in self._parse_user_list(response, self.parse_followers):
            yield request

    def _first_page_requests(self, user):
        """Yield requests for the first followee page and first follower page of ``user``."""
        yield Request(
            self.follows_url.format(
                user=user, include=self.follows_query, offset=0, limit=20),
            callback=self.parse_follows,
        )
        yield Request(
            self.followers_url.format(
                user=user, include=self.followers_query, offset=0, limit=20),
            callback=self.parse_followers,
        )

    def _parse_user_list(self, response, callback):
        """Yield a profile request per listed user, then the next-page request.

        ``callback`` is the method that should parse the next page, so the
        followee and follower lists keep paginating through their own parser.
        """
        result = json.loads(response.text)

        # The loop variable must not reuse the name ``result``: the paging
        # information below lives on the response, not on a user entry.
        for user in result.get('data') or []:
            url_token = user.get('url_token')
            if url_token:
                yield Request(
                    self.user_url.format(
                        user=url_token, include=self.user_query),
                    callback=self.parse_user,
                )

        # The key is spelled 'paging'; its 'next' entry holds the URL of the
        # following page.
        paging = result.get('paging') or {}
        next_page = paging.get('next')
        if paging.get('is_end') is False and next_page:
            yield Request(next_page, callback=callback)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Item, Field


class UserItem(Item):
    """Container for the fields of one user record collected by the spider."""

    # One Field per attribute that is kept for a user.
    id = Field()
    name = Field()
    avatar_url = Field()
    headline = Field()
    url_token = Field()
    url = Field()
    avatar_url_template = Field()
    type = Field()

# define the fields for your item here like: # name = scrapy.Field()

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for zhihuuser project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project's bot.
BOT_NAME = 'zhihuuser'

# Package(s) searched for spiders, and the package new spiders are created in.
SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)'

# Obey robots.txt rules
# Defaults to True, meaning the robots protocol is obeyed; changed to False here.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0) # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default) #COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Browser-style User-Agent string included in the default headers.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}

# Enable or disable spider middlewares # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543, #}

# Enable or disable downloader middlewares # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'zhihuuser.middlewares.ZhihuuserDownloaderMiddleware': 543, #}

# Enable or disable extensions # See https://doc.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #}

# Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'zhihuuser.pipelines.ZhihuuserPipeline': 300, #}

# Enable and configure the AutoThrottle extension (disabled by default) # See https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

相关文章:

版权声明:Python教程2022-10-25发表,共计7428字。
新手QQ群:570568346,欢迎进群讨论 Python51学习