最简单的爬虫代码 python

626次阅读

没有评论

对 Python 爬虫最简单代码感兴趣的小伙伴，下面一起跟随编程之家 jb51.cc 的小编来看看吧。

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# NOTE: the coding declaration above must stay on line 1 or 2 of the file,
# otherwise Python 2 ignores it and rejects the non-ASCII string literals below.
"""Minimal Python 2 crawler.

Logs in to the target site with a cookie-aware opener, then walks the member
listing pages and stores each member's profile in a MySQL table.
"""

import cookielib
import urllib2
from urllib import urlencode

import MySQLdb
from bs4 import BeautifulSoup

# Database connection; autocommit is enabled so every INSERT is persisted
# immediately without an explicit commit().
mydb = MySQLdb.connect(
    host="10.0.66.248",
    user="root",
    passwd="123456",
    db="invCloudOA",
    charset="utf8",
)
cursor = mydb.cursor()
# Pass the flag positionally as a real boolean: the driver's autocommit()
# takes a single positional on/off flag, not the string "on".
mydb.autocommit(True)

# Build an opener with a cookie processor so the session cookie returned by
# the login request is sent automatically on every later request.
cookie = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(cookie)
conn = urllib2.build_opener(handler)

# Login form fields.
param = {
    'email': '88888888@qq.com',
    'password': '88888888',
    'target': 'http://aaaaaa.com/member',
}

# Send the login request (POST, because a data payload is supplied). The
# timeout matches the one used for the crawl requests so a stalled login
# cannot hang the script forever.
resp = conn.open("http://aaaaaa.com/user/login", data=urlencode(param), timeout=20)

# Logged in: walk the paginated member index and store every profile found.

# Parameterized statement: the driver quotes each value, so scraped text
# containing quotes cannot break (or inject into) the SQL. One placeholder per
# column. Column names such as `emai` / `wexin` are kept exactly as written in
# the original statement -- presumably they match the table schema; verify.
INSERT_PERSON_SQL = (
    "INSERT INTO `ft_person`"
    "(`mingzi`,`shouji`,`qq`,`weibo`,`emai`,`wexin`,`chanpinjingli`,"
    "`zhiyejingli`,`gerenjieshao`,`yuanid`,`gongsi`,`zhiwu`,`diqu`) "
    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _fetch_soup(url):
    """Download *url* through the logged-in opener and return it parsed."""
    response = conn.open(url, timeout=20)
    try:
        return BeautifulSoup(response.read())
    finally:
        # Always release the HTTP connection, even when reading/parsing fails.
        response.close()


def _text_or_empty(extract):
    """Return ``extract()``, or "" when the element it looks for is missing."""
    try:
        return extract()
    except (AttributeError, IndexError, KeyError, TypeError):
        # find() returned None, or there were fewer matches than expected.
        return ""


def _parse_person(detail):
    """Extract the profile fields from the detail ``<div>`` of a member page.

    *detail* may be None when the page has no detail container; every field
    is then "". Returns the tuple
    (username, gongsi, zhiwu, diqu, shouji, qq, weibo, email, weixin).
    """
    username = _text_or_empty(
        lambda: detail.find('div', class_='detail')
        .find('div', class_='introduce').find('h4').find('span').get_text())
    gongsi = _text_or_empty(
        lambda: detail.find('div', class_='introduce')
        .find('div', class_="local").find('a').get_text())
    # Falls back to "" like every other field, so a missing value can no
    # longer leave the name undefined or carry over the previous member's.
    zhiwu = _text_or_empty(
        lambda: detail.find('div', class_='introduce')
        .find_all('div', class_="local")[1].get_text())
    # find_all(), not find(): indexing a single Tag looks up an HTML
    # attribute, so the original expression could never yield a value.
    diqu = _text_or_empty(
        lambda: detail.find_all('div', class_="local")[2].find('a').get_text())

    # The contact blocks are read by position: phone, QQ, Weibo, e-mail, WeChat.
    contacts = detail.find_all('div', class_='contacts') if detail is not None else []
    shouji, qq, weibo, email, weixin = [
        contacts[k].get_text() if k < len(contacts) else ""
        for k in range(5)
    ]
    return username, gongsi, zhiwu, diqu, shouji, qq, weibo, email, weixin


try:
    for i in xrange(1, 200):
        print("当前页码 %d" % i)
        try:
            bs = _fetch_soup("http://aaaaaa.com/member/index/page/%d" % i)
            a_list = bs.select('li[class="col-sm-6 col-md-4 col-lg-6"] a')
        except StandardError as ex:
            print(ex)
            print("采集第 %d 页失败!" % i)
            continue

        for a in a_list:
            href = a.get("href")
            if not href:
                # An anchor without a link has no profile page to fetch.
                continue
            # Guard each profile separately so one bad page does not abort
            # the remaining members of the same listing page.
            try:
                person_soup = _fetch_soup("http://aaaaaa.com%s" % href)
                person_detail_soup = person_soup.find(
                    'div', class_='col-sm-12 col-md-8 detail-left min-padding')
                (username, gongsi, zhiwu, diqu,
                 shouji, qq, weibo, email, weixin) = _parse_person(person_detail_soup)

                # The three "" values fill chanpinjingli / zhiyejingli /
                # gerenjieshao, which this crawler does not collect
                # (assumed from the column list -- confirm against the schema).
                cursor.execute(INSERT_PERSON_SQL, (
                    username, shouji, qq, weibo, email, weixin,
                    "", "", "", href, gongsi, zhiwu, diqu))
                print("采集成功 %s" % username.encode('utf-8'))
            except StandardError as ex:
                print(ex)
                print("采集该人物失败 %s" % href)
finally:
    # Release the database resources even if the crawl is interrupted.
    cursor.close()
    mydb.close()

总结

以上是编程之家为你收集整理的 Python 爬虫最简单代码的全部内容，希望文章能够帮你解决在 Python 爬虫最简单代码方面遇到的程序开发问题。

如果觉得编程之家网站内容还不错，欢迎将编程之家网站推荐给程序员好友。

本图文内容来源于网友网络收集整理提供，作为学习参考使用，版权属于原作者。

如您喜欢交流学习经验，点击链接加入交流1群：1065694478(已满)交流2群：163560250

神龙|纯净稳定代理IP免费测试>>>>>>>>天启|企业级代理IP免费测试>>>>>>>>IPIPGO|全球住宅代理IP免费测试

发表于：Python爬虫

2022-10-28

# Python爬虫

复制链接

赏

最简单的爬虫代码 python

相关文章：

HTTP代理设置详解：一步步配置指南

什么是Socks5代理IP及其优势

Socks5代理配置教程及注意事项

什么是代理服务器IP：如何选择合适的

国外代理服务器的优势及选择建议

如何找到可靠的免费代理服务器

在线代理服务器的使用与推荐

HTTP代理服务器的设置及应用实例

静态代理IP怎么填写：步骤与示例

海外静态IP的代理选择与配置