Advanced Python crawler code


#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                 # download web page
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url, 'http', 0)     # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                 # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)             # local directory
        if sep != '/':                   # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):              # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                  # download web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):          # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                   # manage entire crawling process
    count = 0                            # static download page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':             # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()     # get and process links
        for eachlink in links:
            if eachlink[:4] != 'http' and \
                    find(eachlink, '://') == -1:
                eachlink = urljoin(url, eachlink)
            print '* ', eachlink

            if find(lower(eachlink), 'mailto') != -1:
                print '... discarded, mailto link'
                continue

            if eachlink not in self.seen:
                if find(eachlink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                        # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
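Note that the listing above is Python 2 only: htmllib, the formatter module, cStringIO, the top-level urlparse module, the string function imports, raw_input, and the print statement were all removed or reorganized in Python 3. Below is a minimal Python 3 sketch of the same crawl loop, built on html.parser and urllib from the standard library. It is a sketch under stated assumptions, not a drop-in replacement: it collects in-domain links in memory instead of mirroring each page to a local file, and the names LinkParser and get_page are illustrative, not part of the original program.

#!/usr/bin/env python3
# Python 3 sketch of the crawler above. Assumptions: LinkParser and
# get_page are illustrative names; pages are parsed in memory rather
# than mirrored to disk as in the Python 2 original.

from sys import argv
from urllib.parse import urlparse, urljoin
from urllib.request import urlopen
from html.parser import HTMLParser


class LinkParser(HTMLParser):
    # collect every <a href=...>, mimicking htmllib's anchorlist
    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)


class Crawler:
    def __init__(self, url):
        self.q = [url]                       # URLs still to fetch
        self.seen = set()                    # URLs already processed
        self.dom = urlparse(url).netloc      # stay inside this domain

    def get_page(self, url):
        try:
            html = urlopen(url).read().decode('utf-8', 'replace')
        except OSError as e:                 # URLError subclasses OSError
            print('*** ERROR: invalid URL "%s" (%s)' % (url, e))
            return
        self.seen.add(url)
        print('URL:', url)
        parser = LinkParser()
        parser.feed(html)
        for link in parser.anchorlist:
            link = urljoin(url, link)        # resolve relative links
            if link.startswith('mailto:'):
                continue                     # ... discarded, mailto link
            if urlparse(link).netloc != self.dom:
                continue                     # ... discarded, not in domain
            if link not in self.seen and link not in self.q:
                self.q.append(link)          # ... new, added to Q

    def go(self):                            # process links in queue
        while self.q:
            self.get_page(self.q.pop())


if __name__ == '__main__':
    url = argv[1] if len(argv) > 1 else input('Enter starting URL: ')
    if url:
        Crawler(url).go()

Assuming either version is saved as crawl.py (a hypothetical filename), it is started from the command line, e.g. python crawl.py http://www.example.com/ — with no argument, it prompts for a starting URL instead.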
