Five Python Crawler Examples: An Introduction to Crawlers and Installing PyCharm


Example 1: Tencent Comics crawler

import requests
from fake_useragent import UserAgent
import re
import threading


def replace_mark(my_str):
    # Swap ASCII commas/quotes for full-width ones so the values do not break the CSV
    return my_str.replace(",", ",").replace('"', "“")


def format_html(html):
    li_pattern = re.compile(
        r'<li\sclass="ret-search-item clearfix">[\s\S]+?</li>')
    title_url_pattern = re.compile(
        r'<a href="(.*?)" target="_blank" title=".*?">(.*?)</a>')
    # "签约" marks signed works, "独家" marks exclusives on the target site
    sign_pattern = re.compile(r'<i class="ui-icon-sign">签约</i>')
    exclusive_pattern = re.compile(r'<i class="ui-icon-exclusive">独家</i>')
    author_pattern = re.compile(
        r'<p class="ret-works-author" title=".*?">(.*?)</p>')
    tags_pattern = re.compile(r'<span href=".*?" target="_blank">(.*?)</span>')
    # "人气" is the popularity counter
    score_pattern = re.compile(r'<span>人气:<em>(.*?)</em></span>')

    items = li_pattern.findall(html)
    for item in items:
        title_url = title_url_pattern.search(item)
        title = title_url.group(2)
        url = title_url.group(1)
        sign = 0
        exclusive = 0
        if sign_pattern.search(item) is not None:
            sign = 1
        if exclusive_pattern.search(item) is not None:
            exclusive = 1
        author = author_pattern.search(item).group(1)
        tags = tags_pattern.findall(item)
        score = score_pattern.search(item).group(1)

        lock.acquire()
        with open("./qq.csv", "a+", encoding="utf-8") as f:
            f.write(
                f'{replace_mark(title)},{url},{sign},{exclusive},{replace_mark(author)},{"#".join(tags)},"{replace_mark(score)}" \n')
        lock.release()


def run(index):
    ua = UserAgent(use_cache_server=False)
    response = requests.get(
        f"https://ac.qq.com/Comic/index/page/{index}",
        headers={'User-Agent': ua.random})
    html = response.text
    format_html(html)
    semaphore.release()


lock = threading.Lock()

if __name__ == "__main__":
    # Fetch at most 5 pages concurrently
    semaphore = threading.BoundedSemaphore(5)
    lst_record_threads = []
    for index in range(1, 462):
        print(f"Crawling page {index}")
        semaphore.acquire()
        t = threading.Thread(target=run, args=(index,))
        t.start()
        lst_record_threads.append(t)

    for rt in lst_record_threads:
        rt.join()

    print("Data crawl finished")

Example 2: 19lou matchmaking forum crawler

import requests
from lxml import etree
from fake_useragent import UserAgent
import time


def save(src, title):
    try:
        res = requests.get(src)
        with open(f"imgs/{title}.jpg", "wb+") as f:
            f.write(res.content)
    except Exception as e:
        print(e)


def run(url):
    # ua = UserAgent(cache=False)
    ua = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36"
    headers = {
        "User-Agent": ua,
        "Host": "www.19lou.com",
        "Referer": "https://www.19lou.com/r/1/19lnsxq-233.html",
        "Cookie": "_Z3nY0d4C_=37XgPK9h"  # value obtained from the site's anti-crawling code
    }
    try:
        res = requests.get(url=url, headers=headers)
        text = res.text
        # Convert the HTML into an Element object
        html = etree.HTML(text)
        # XPath extraction; @class selects the class attribute
        divs = html.xpath("//div[@class='pics']")
        # print(len(divs))
        # Iterate over the matched element nodes
        for div in divs:
            # Note: the image address is in data-src, not src
            src = div.xpath("./img/@data-src")[0]
            # Extract the title
            title = div.xpath("./img/@alt")[0]
            save(src, title)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    urls = ["https://www.19lou.com/r/1/19lnsxq.html"]
    for i in range(114, 243):
        urls.append(f"https://www.19lou.com/r/1/19lnsxq-{i}.html")

    for url in urls:
        print(f"Crawling {url}")
        run(url)
        # time.sleep(5)

    print("All pages crawled")

Example 3: Snack franchise data

import requests
from lxml import etree
import time
import re
import random

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]


def run(url, index):
    # Download one listing page and save the raw HTML locally
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        res = requests.get(url=url, headers=headers)
        res.encoding = "utf-8"
        html = res.text
        with open(f"./html/{index}.html", "w+", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        print(e)


def list_str(my_list):
    return ",".join(my_list)


def get_data():
    for i in range(1, 130):
        with open(f"./html/{i}.html", "r", encoding="utf-8") as f:
            html = f.read()
        element = etree.HTML(html)
        # contains() selects elements whose class includes the given value;
        # starts-with, ends-with and last() work similarly
        origin_li = element.xpath("//ul[contains(@class,'xm-list')]/li")
        # Walk the data inside each li
        for item in origin_li:
            # Franchise name
            # title = item.xpath(".//div[@class='r']/h4/text()")[0]
            title = item.xpath("./div[@class='top']/a/@title")[0]
            # Detail-page link
            detail_link = "http:" + item.xpath("./div[@class='top']/a/@href")[0]

            # Special class marker; items carrying xm-list2 need different extraction rules
            special_tag = list_str(item.xpath("./@class"))

            if special_tag != "xm-list2":
                # Tags
                tags = "#".join(item.xpath(".//div[@class='bot']/span[@class='label']/text()"))
                # Investment price
                price = list_str(item.xpath(".//div[@class='bot']/span[@class='money']/b/text()"))
                # City and industry
                city_industry = list_str(item.xpath("./div[@class='bot']/p/span/text()"))
                long_str = f"{title},{detail_link}, {tags}, {price}, {city_industry}"
                save(long_str)
            else:
                # City and industry
                city_industry = list_str(item.xpath("./div[@class='top']/a/div/p[2]/span/text()"))
                long_str = f"{title},{detail_link}, {0}, {0}, {city_industry}"
                save(long_str)


def save(long_str):
    try:
        with open("./jiameng.csv", "a+", encoding="utf-8") as f:
            f.write("\n" + long_str)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Step 1: download the raw pages (uncomment to fetch them first)
    # for i in range(1, 130):
    #     print(f"Crawling page {i}")
    #     run(f"https://www.3158.cn/xiangmu/canyin/?pt=all&page={i}", i)

    # Step 2: extract data from the locally saved pages
    get_data()

    print("Extraction finished")

Example 4: Audio crawler

import time

import asyncio
import aiohttp

from bs4 import BeautifulSoup
import lxml


async def get_html(client, url):
    print("Fetching", url)
    async with client.get(url, timeout=5) as resp:
        html = await resp.text()
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all(attrs={'class': 'img_mini'})
        mp3_urls = [get_mp3_url("https://www.bensound.com/" + div.a.img["src"]) for div in divs]
        return mp3_urls


def get_mp3_url(img_url):
    # Derive the mp3 address from the thumbnail image address
    name = img_url[img_url.rfind("/") + 1:img_url.rfind(".")]
    mp3_url = f"https://www.bensound.com/bensound-music/bensound-{name}.mp3"
    return mp3_url


async def get_mp3_file(client, url):
    print("Downloading mp3 file", url)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
        "Referer": "https://www.bensound.com/royalty-free-music"
    }
    mp3_file_name = url[url.rfind('-') + 1:url.rfind('.')]
    print(mp3_file_name)
    async with client.get(url, headers=headers) as resp:
        content = await resp.read()
        with open(f'./mp3/{mp3_file_name}.mp3', "wb") as f:
            f.write(content)
        return (url, "success")


async def main(urls):
    timeout = aiohttp.ClientTimeout(total=600)  # raise the total timeout to 600 seconds
    connector = aiohttp.TCPConnector(limit=50)  # cap the number of concurrent connections
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as client:
        tasks = [asyncio.ensure_future(get_html(client, urls[i])) for i in range(len(urls))]
        dones, pendings = await asyncio.wait(tasks)
        print("Listing pages fetched; collecting the results:")
        all_mp3 = []
        for task in dones:
            all_mp3.extend(task.result())

        totle = len(all_mp3)
        print("Found a total of", totle, "MP3 files")
        print("_" * 100)
        print("Preparing to download the MP3 files")

        # Download 10 files per batch
        totle_page = totle // 10 if totle % 10 == 0 else totle // 10 + 1

        for page in range(0, totle_page):
            print("Downloading MP3 batch {}".format(page + 1))
            start_page = 0 if page == 0 else page * 10
            end_page = (page + 1) * 10
            print("URLs to download")
            print(all_mp3[start_page:end_page])
            mp3_download_tasks = [asyncio.ensure_future(get_mp3_file(client, url))
                                  for url in all_mp3[start_page:end_page]]
            mp3_dones, mp3_pendings = await asyncio.wait(mp3_download_tasks)
            for task in mp3_dones:
                print(task.result())


if __name__ == '__main__':
    url_format = "https://www.bensound.com/royalty-free-music/{}"
    urls = [url_format.format(i) for i in range(1, 5)]
    start_time = time.perf_counter()
    asyncio.run(main(urls))
    print("aiohttp async crawl took:", time.perf_counter() - start_time)

Example 5: Mobile wallpaper scraping

import requests
import re
import threading

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}


# Fetch each listing page and pull out the wallpaper URLs
def get_image(base_url):
    res = requests.get(url=base_url, headers=headers)
    if res is not None:
        html = res.text
        pattern = re.compile(
            '<img lazysrc="(.*?)" lazysrc2x=".*?" width="221" height="362" alt=".*?" title="(.*?)"')
        match_list = pattern.findall(html)
        for url, title in match_list:
            save_image(url[:url.find('jpg') + 3], title)
        print(match_list)
    semaphore.release()


def save_image(url, title):
    try:
        print(f"{title} - {url}")
        res = requests.get(url=url, headers=headers)
        if res is not None:
            with open(f"images/{title}.jpg", "wb+") as f:
                f.write(res.content)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Open at most 5 threads at a time
    semaphore = threading.BoundedSemaphore(5)
    for index in range(189):
        semaphore.acquire()
        t = threading.Thread(target=get_image, args=(
            f"https://www.3gbizhi.com/sjbz/index_{index}.html",))
        t.start()

    # Busy-wait until only the main thread is left
    while threading.active_count() != 1:
        pass
    else:
        print('All threads finished')

I. What is a crawler?

A crawler is a program that automatically fetches information from the internet, pulling out the data that is valuable to us.
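
As a minimal illustration, the smallest possible "crawler" just downloads one page and hands us the raw HTML. Here is a hedged sketch using the third-party requests library (the same library the examples above rely on); the URL is only a placeholder:

import requests

# Fetch a single page and show what a crawler works with: a status code plus raw HTML
response = requests.get("https://www.example.com", timeout=10)
print(response.status_code)    # 200 means the page was fetched successfully
print(response.text[:200])     # first 200 characters of the HTML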

II. Python crawler architecture

A Python crawler architecture is made up of five main parts: the scheduler, the URL manager, the page downloader, the page parser, and the application (the valuable data that is crawled).

  • Scheduler: the CPU of the crawler, responsible for coordinating the URL manager, the downloader, and the parser.
  • URL manager: keeps track of the URLs waiting to be crawled and the URLs already crawled, preventing duplicate or circular crawling. It is typically implemented in one of three ways: in memory, in a database, or in a cache database.
  • Page downloader: takes a URL, downloads the page, and turns it into a string. Options include urllib2 (the official Python 2 standard-library module, which can handle logins, proxies, and cookies) and requests (a third-party package).
  • Page parser: takes the page string and extracts the information we need, either by pattern matching or by walking the DOM tree. Choices include regular expressions (intuitive fuzzy matching on the page string, but painful once the document gets complex), html.parser (bundled with Python), Beautiful Soup (a third-party library that can use either html.parser or lxml underneath, and is more powerful than the others), and lxml (a third-party library that parses both XML and HTML). html.parser, Beautiful Soup, and lxml all parse the page as a DOM tree.
  • Application: the program built from the useful data extracted from the pages.

The figure below illustrates how the scheduler coordinates the work:

[Figure: the scheduler driving the URL manager, downloader, and parser in a loop]
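
To make the division of labor concrete, here is a minimal, hedged sketch of the five parts cooperating; the class and function names are illustrative rather than taken from any framework, and the seed URL is a placeholder:

import re

import requests


class UrlManager:
    """Keeps URLs waiting to be crawled and URLs already crawled, so nothing is fetched twice."""

    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add(self, url):
        if url and url not in self.old_urls:
            self.new_urls.add(url)

    def has_next(self):
        return bool(self.new_urls)

    def get(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url


def download(url):
    # Downloader: turn a URL into the page's HTML string
    return requests.get(url, timeout=10).text


def parse(html):
    # Parser: extract the information we care about (here just the <title>);
    # a real parser would also return newly discovered URLs to feed back into the manager
    match = re.search(r"<title>(.*?)</title>", html, re.S)
    return match.group(1) if match else ""


if __name__ == "__main__":
    # Scheduler: drives the URL manager, the downloader and the parser in a loop
    manager = UrlManager()
    manager.add("https://www.example.com")
    results = []  # the "application": the valuable data collected
    while manager.has_next():
        results.append(parse(download(manager.get())))
    print(results)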

III. Three ways to download a web page with urllib2

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import cookielib
import urllib2

url = "http://www.baidu.com"

response1 = urllib2.urlopen(url)
print "Method 1"
# Status code; 200 means success
print response1.getcode()
# Length of the page content
print len(response1.read())

print "Method 2"
request = urllib2.Request(url)
# Pretend to be a Mozilla browser
request.add_header("user-agent", "Mozilla/5.0")
response2 = urllib2.urlopen(request)
print response2.getcode()
print len(response2.read())

print "Method 3"
cookie = cookielib.CookieJar()
# Give urllib2 the ability to handle cookies
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
urllib2.install_opener(opener)
response3 = urllib2.urlopen(url)
print response3.getcode()
print len(response3.read())
print cookie
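
The snippet above is Python 2 only (urllib2 and cookielib no longer exist in Python 3). As a hedged sketch, the same three approaches under Python 3 use urllib.request and http.cookiejar:

import http.cookiejar
import urllib.request

url = "http://www.baidu.com"

# Method 1: plain urlopen
response1 = urllib.request.urlopen(url)
print("Method 1")
print(response1.getcode())        # 200 means success
print(len(response1.read()))      # length of the page content

# Method 2: a Request object with a browser-like User-Agent header
request = urllib.request.Request(url)
request.add_header("User-Agent", "Mozilla/5.0")
response2 = urllib.request.urlopen(request)
print("Method 2")
print(response2.getcode())
print(len(response2.read()))

# Method 3: an opener that can handle cookies
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
urllib.request.install_opener(opener)
response3 = urllib.request.urlopen(url)
print("Method 3")
print(response3.getcode())
print(len(response3.read()))
print(cookie)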

IV. Installing the third-party library Beautiful Soup

Beautiful Soup is a third-party Python library for extracting data from XML and HTML documents; documentation is available on the official Beautiful Soup website.

1. Install Beautiful Soup

Open cmd (the command prompt), go to the Scripts directory inside your Python (2.7) installation, and type dir to check whether pip.exe is there. If it is, you can install the package with Python's bundled pip by running:

pip install beautifulsoup4

2. Test whether the installation succeeded

Create a Python file containing:

import bs4
print bs4

Run the file; if it prints the module information without errors, the installation succeeded.
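
The check above uses Python 2 print syntax. Under Python 3, a hedged equivalent is:

import bs4

print(bs4.__version__)   # prints the installed Beautiful Soup version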

V. Parsing an HTML file with Beautiful Soup

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">…</p>
"""

# Create a BeautifulSoup parse object
soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf-8")

# Get all the links
links = soup.find_all('a')
print "All links"
for link in links:
    print link.name, link['href'], link.get_text()

print "Find a specific URL"
link_node = soup.find('a', href="http://example.com/elsie")
print link_node.name, link_node['href'], link_node['class'], link_node.get_text()

print "Regular-expression match"
link_node = soup.find('a', href=re.compile(r"ti"))
print link_node.name, link_node['href'], link_node['class'], link_node.get_text()

print "Text of the story paragraph"
p_node = soup.find('p', class_='story')
print p_node.name, p_node['class'], p_node.get_text()
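
The example above is written for Python 2.7. As a hedged sketch, the same steps under Python 3 only need print() calls, and from_encoding can be dropped because the markup is already a str:

import re

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">…</p>
"""

soup = BeautifulSoup(html_doc, "html.parser")

print("All links")
for link in soup.find_all('a'):
    print(link.name, link['href'], link.get_text())

print("Find a specific URL")
link_node = soup.find('a', href="http://example.com/elsie")
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Regular-expression match")
link_node = soup.find('a', href=re.compile(r"ti"))
print(link_node.name, link_node['href'], link_node['class'], link_node.get_text())

print("Text of the story paragraph")
p_node = soup.find('p', class_='story')
print(p_node.name, p_node['class'], p_node.get_text())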

PyCharm

PyCharm is a Python IDE made by JetBrains.

It has everything you would expect from a Python IDE: debugging, syntax highlighting, project management, code navigation, smart hints, auto-completion, unit testing, version control, and so on.

In addition, PyCharm provides good support for Django development, supports Google App Engine, and, even better, supports IronPython.

PyCharm can be downloaded from the official JetBrains website.

What it looks like:

[Screenshot: the PyCharm interface]


Sublime Text

Sublime Text has a beautiful user interface and powerful features such as a code minimap, Python plugins, and snippets, and it also lets you customize key bindings, menus, and toolbars.

Its main features include spell checking, bookmarks, a complete Python API, the Goto feature, instant project switching, multiple selections, multiple windows, and more.

Sublime Text is a cross-platform editor, available for Windows, Linux, Mac OS X, and other operating systems.

[Screenshot: the Sublime Text editor]

With Sublime Text 2's plugin system you can easily build a decent Python IDE. A few recommended plugins (you can find many more):

  • CodeIntel: auto-completion plus member/method hints (highly recommended)
  • SublimeREPL: for running and debugging programs that need interaction (e.g. anything that uses input())
  • Bracket Highlighter: bracket matching and highlighting
  • SublimeLinter: PEP 8 style checking

Eclipse+Pydev

1. Install Eclipse

Eclipse can be found and downloaded from its official site, Eclipse.org; pick the edition that suits you, for example Eclipse Classic. After downloading, simply unpack it into the directory where you want it installed.

Before running Eclipse, make sure a Java runtime environment is installed, i.e. a JRE or JDK; you can download a JRE from http://www.java.com/en/download/manual.jsp.

2. Install PyDev

After starting Eclipse, choose Help -> Install New Software, as shown below.

[Screenshot: the Help -> Install New Software menu]

Click Add and enter the PyDev update site, http://pydev.org/updates/, as shown below.

[Screenshot: adding the PyDev update site]

Click OK, then click the "+" next to PyDev to expand its node. Wait a moment while it fetches the PyDev packages from the network; once they appear as child nodes, check them and click Next to install, as shown below.

[Screenshot: selecting the PyDev components to install]

When the installation finishes, restart Eclipse.

3. Configure PyDev

After installation, PyDev still needs to be configured: choose Window -> Preferences to open the PyDev settings, then set the Python path by clicking New on the PyDev Interpreter - Python page.

[Screenshot: the PyDev interpreter preferences page]

A dialog pops up asking for the location of your Python installation; select the directory where Python is installed.

[Screenshot: choosing the Python installation path]

With that, PyDev is configured and ready to use.

4. Create a Python project

With Eclipse and PyDev installed, we can start developing projects. First create a project: choose File -> New -> PyDev Project.

[Screenshot: the File -> New -> PyDev Project menu]

In the new dialog, fill in the Project Name and where to save the project, then click Next to finish creating it.

[Screenshot: entering the project name and location]

5. Create a new PyDev module

A project alone cannot be run; next you must create a new PyDev Module: choose File -> New -> PyDev Module.

[Screenshot: the File -> New -> PyDev Module menu]

In the dialog, choose where to store the file and enter the Module Name; note that you do not add .py to the name, it is appended automatically. Then click Finish.

[Screenshot: choosing the module location and name]

输入"hello world"的代码。

python爬虫五大实例,爬虫初始与PyCharm安装
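
The screenshot showed a one-line script; for example:

print("hello world")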

6. Run the program

Once the program is written, we can run it: find the Run button in the toolbar at the top.

[Screenshot: the Run button on the toolbar]

A dialog then appears asking how to run the file; normally we choose Python Run to start the program.

[Screenshot: choosing Python Run]
