python爬虫两个简单入门实例

1,320次阅读

一、建表
二、实例代码
- 实例一
- 实例二
总结

提示：请先安装 Python 3 环境，并用 pip 安装代码中导入的第三方库（requests、beautifulsoup4、pymysql）。以下是本篇文章的正文内容，下面的案例可供参考。

-- Table that both crawler examples write to.
-- String literals must use straight quotes ('...'); the typographic quotes
-- produced by the blog editor are rejected by MySQL.
CREATE TABLE article (
  id int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id',
  artname varchar(50) NOT NULL,               -- article title
  href text NOT NULL COMMENT '链接',           -- link to the article
  content text NOT NULL COMMENT '详情',        -- article body text
  img text NOT NULL COMMENT '图片',            -- local path of the downloaded image
  PRIMARY KEY (id)
) ENGINE=InnoDB AUTO_INCREMENT=624 DEFAULT CHARSET=utf8mb4;

代码如下（示例）：

import requests from bs4 import BeautifulSoup # Beautiful Soup 最主要的功能是从网页抓取数据 import datetime import pymysql import time # 这个模块提供各种与时间相关的函数

# Connect to the database.
# The charset is utf8mb4 to match the article table (DEFAULT CHARSET=utf8mb4);
# MySQL's legacy 'utf8' is limited to 3-byte characters, so scraped text
# containing 4-byte characters (e.g. emoji) could not be stored with it.
# NOTE(review): credentials are hard-coded for the tutorial; load them from
# configuration or the environment in real use.
connect = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='jmroot',
    db='python',
    charset='utf8mb4',
)

def get_one_page():
    """Scrape the article list page and store every article in MySQL.

    Fetches the hard-coded list URL and follows the link of each ``<li>``
    inside the ``list-left`` container. For every article it joins the text
    of all ``<p>`` tags found in the ``detail-main`` element, downloads the
    image referenced in ``detail-mainImg`` (when there is one) into the image
    folder, and hands title, link, text and image path to
    ``inset_spec_code``. When no image could be saved the path is ``''``.

    Prints the elapsed time in whole seconds when done. Returns ``None``.
    """
    import os  # local import: only used to make sure the image folder exists

    headers = {
        # The User-Agent header identifies the client issuing the request.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    timeout = 10  # seconds; without a timeout a stalled server blocks forever
    start_time = datetime.datetime.now()
    url = 'http://www.mama.cn/z/t1183/'
    # Folder the downloaded images are written to.
    root = "E://reptile//images//"

    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    container = soup.find(class_='list-left')
    # find() returns None when the element is missing; treat that as "no items"
    # instead of failing with AttributeError.
    items = container.find_all('li') if container is not None else []
    for item in items:
        link = item.find('a')
        if link is None or not link.get('href'):
            continue  # list entry without a usable link
        title = link.get_text(strip=True)
        href = link['href']
        time.sleep(1)  # throttle: wait one second between article requests

        # Fetch the article page itself.
        page = requests.get(href, headers=headers, timeout=timeout)
        web_text = BeautifulSoup(page.text, "html.parser")
        detail = web_text.find(class_='detail-main')
        paragraphs = detail.find_all('p') if detail is not None else []
        # get_text() also covers <p> tags with nested markup, for which
        # .string is None and would have been stored as the text "None".
        content = ''.join(p.get_text() for p in paragraphs)

        # Reset per article so a missing image never reuses the previous
        # article's path (or hits an unbound name on the first article).
        path = ''
        try:
            div_imgs = web_text.find('div', class_='detail-mainImg')
            img_url = div_imgs.find('img')['src']
            # Download first so a failed request does not leave an empty file.
            image = requests.get(img_url, timeout=timeout).content
            os.makedirs(root, exist_ok=True)
            path = root + img_url.split("/")[-1]
            with open(path, "wb") as f:  # "wb": the image is binary data
                f.write(image)
        except (AttributeError, TypeError, KeyError, OSError):
            # Missing element/attribute, or a download/write failure
            # (requests' exceptions derive from OSError).
            path = ''
            print("抱歉，找不到图片")
        inset_spec_code(title, href, content, path)
    end_time = datetime.datetime.now()
    # total_seconds() keeps whole days that timedelta.seconds would drop.
    print(int((end_time - start_time).total_seconds()))

# Obtain a cursor from the open database connection.
cursor = connect.cursor()

def inset_spec_code(artname, href, content, img):
    """Insert one row into the ``article`` table.

    Args:
        artname: Article title.
        href: Link to the article.
        content: Article body text.
        img: Local path of the downloaded image.

    Uses the module-level ``cursor`` and ``connect``. Failures are reported
    on stdout and do not propagate, so one bad row does not stop the crawl.
    """
    # The values are passed separately and escaped by the driver. Building
    # the statement with "%" string formatting breaks on any quote in the
    # scraped text and allows SQL injection.
    sql = "INSERT INTO article(artname,href,content,img) VALUES (%s,%s,%s,%s)"
    data = (artname, href, content, img)
    try:
        cursor.execute(sql, data)
        connect.commit()
        print('成功插入', cursor.rowcount, '条数据')
    except Exception as exc:
        # Best-effort insert: undo the failed statement and show the reason.
        connect.rollback()
        print("插入失败", exc)

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    get_one_page()

import requests from bs4 import BeautifulSoup # Beautiful Soup 最主要的功能是从网页抓取数据 import datetime import pymysql import time # 这个模块提供各种与时间相关的函数

# Connect to the database.
# The charset is utf8mb4 to match the article table (DEFAULT CHARSET=utf8mb4);
# MySQL's legacy 'utf8' is limited to 3-byte characters, so scraped text
# containing 4-byte characters (e.g. emoji) could not be stored with it.
# NOTE(review): credentials are hard-coded for the tutorial; load them from
# configuration or the environment in real use.
connect = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='jmroot',
    db='python',
    charset='utf8mb4',
)

# Obtain a cursor from the open database connection.
cursor = connect.cursor()

class gain_data():
    """Scrape the paginated case list of www.demaila.com into MySQL.

    ``spider`` walks the pages, ``load`` processes a single page and
    ``inset_spec_code`` writes one row. The class uses the module-level
    ``connect`` / ``cursor`` objects and the ``requests``, ``BeautifulSoup``,
    ``datetime`` and ``time`` names imported by the script.
    """

    def spider(self):
        """Crawl page 1, 2, 3, ... until a page yields no entries.

        Prints the elapsed time in whole seconds when done.
        """
        start_time = datetime.datetime.now()
        page = 1
        # load() reports whether the page contained at least one entry.
        while self.load("https://www.demaila.com/case.html?page=" + str(page)):
            page += 1
        end_time = datetime.datetime.now()
        # total_seconds() keeps whole days that timedelta.seconds would drop.
        print(int((end_time - start_time).total_seconds()))

    def load(self, url):
        """Scrape one list page and store each entry found on it.

        For every ``card-body p-4`` element inside the ``col-lg-12``
        container, reads the title from the link in the ``h5`` element,
        downloads the image referenced in ``dropdown-menu`` (when there is
        one) and inserts title and image path via ``inset_spec_code``.

        Args:
            url: Address of the list page to fetch.

        Returns:
            True if at least one entry was processed, otherwise False.
        """
        import os  # local import: only used to make sure the image folder exists

        headers = {
            # The User-Agent header identifies the client issuing the request.
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        timeout = 10  # seconds; without a timeout a stalled server blocks forever
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, "html.parser")
        container = soup.find(class_='col-lg-12')
        if container is None:
            # No list container on this page: report "nothing found" so the
            # pagination loop stops instead of failing with AttributeError.
            return False
        cards = container.find_all(class_="card-body p-4")
        # Folder the downloaded images are written to.
        root = "E://reptile//images//"
        found = False
        for card in cards:
            heading = card.find(class_='h5')
            link = heading.find('a') if heading is not None else None
            if link is None:
                continue  # entry without a title link
            title = link.get_text(strip=True)

            # Reset per entry so a missing image never reuses the previous
            # entry's path (or hits an unbound name on the first entry).
            path = ''
            try:
                div_imgs = card.find('div', class_='dropdown-menu')
                img_url = div_imgs.find('img')['src']
                # Download first so a failed request does not leave an empty file.
                image = requests.get(img_url, timeout=timeout).content
                os.makedirs(root, exist_ok=True)
                # Drop the query string, then keep the last path segment
                # as the file name.
                path = root + img_url.split("?")[0].split("/")[-1]
                with open(path, "wb") as f:  # "wb": the image is binary data
                    f.write(image)
            except (AttributeError, TypeError, KeyError, OSError):
                # Missing element/attribute, or a download/write failure
                # (requests' exceptions derive from OSError).
                path = ''
                print("抱歉，找不到图片")
            time.sleep(2)  # throttle: wait two seconds between entries
            self.inset_spec_code(title, path)
            found = True
        return found

    def inset_spec_code(self, artname, img):
        """Insert one row (title and image path) into the ``article`` table.

        Args:
            artname: Entry title.
            img: Local path of the downloaded image, or ``''``.

        Failures are reported on stdout and do not propagate.
        """
        # href and content are NOT NULL columns without a default in the
        # article table, so an INSERT that omits them is rejected when MySQL
        # runs in strict mode; empty strings are stored explicitly instead.
        # The values are passed separately and escaped by the driver rather
        # than formatted into the statement, which would break on quotes and
        # allow SQL injection.
        sql = "INSERT INTO article(artname,href,content,img) VALUES (%s,%s,%s,%s)"
        data = (artname, '', '', img)
        try:
            cursor.execute(sql, data)
            connect.commit()
            print('成功插入', cursor.rowcount, '条数据')
        except Exception as exc:
            # Best-effort insert: undo the failed statement and show the reason.
            connect.rollback()
            print("插入失败", exc)

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    n = gain_data()
    n.spider()

建好上面的数据表，并把数据库连接参数和图片保存路径改成自己的之后，直接运行代码，就可以爬到你想要的内容了

好多人想学python不知如何入手，此代码最适合刚接触python的小白学习研究，如果帮助到你了，给个关注呗

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

发表于：Python爬虫

2022-10-28

# Python爬虫

复制链接

赏

python爬虫两个简单入门实例

python爬虫简单入门实例

一、建表

二、实例代码

实例一

实例二

总结

相关文章：

HTTP代理设置详解：一步步配置指南

什么是Socks5代理IP及其优势

Socks5代理配置教程及注意事项

什么是代理服务器IP：如何选择合适的

国外代理服务器的优势及选择建议

如何找到可靠的免费代理服务器

在线代理服务器的使用与推荐

HTTP代理服务器的设置及应用实例

静态代理IP怎么填写：步骤与示例

海外静态IP的代理选择与配置