Thoughts on Crawling Strategies for Different Websites


I want to build a vulnerability information database, which means crawling several vulnerability-tracking websites. All of these sites have some degree of anti-crawling protection, so a different crawling strategy can be used for each.

1. Crawling directly with requests

For sites with no anti-crawling mechanism, you can fetch pages directly with requests; sending a browser-like request header and extending the wait between requests is usually enough.
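The idea in miniature: send a browser-like User-Agent and pause between requests, retrying on transient failures. A minimal sketch (the retry count and the 5-10 s delay range here are arbitrary choices, not from the original post):

import random
import time

import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}


def polite_get(url, retries=3):
    """Fetch a page with a browser-like header, backing off between retries."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=60)
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            time.sleep(5 * (attempt + 1))  # wait a little longer after each failure
    return None


if __name__ == '__main__':
    for page in range(1, 4):  # example: the first three listing pages
        html = polite_get('http://vulhub.org.cn/vulns/{}?view=global'.format(page))
        time.sleep(random.uniform(5, 10))  # randomized pause so the request rate looks human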

Example site: VULHUB, the information security vulnerability portal (http://vulhub.org.cn/)

import requests
from bs4 import BeautifulSoup
import time


def get_html(url):
    """Fetch the raw HTML of a page."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
        r = requests.get(url, headers=headers, timeout=60)
        return r.text
    except requests.RequestException:
        return "ERROR"


def get_content(url):
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    liTags = soup.find_all('tr')
    count = 0
    for li in liTags:
        try:
            resource = 'http://vulhub.org.cn/'                                           # source site
            vul_id = 'http://vulhub.org.cn/' + li.find('a')['href']                      # identifier
            submit_time = li.find('td', attrs={'class': 'hidden-xs'}).text.strip()       # submission time
            vul_level = li.find('span', attrs={'class': 'grade'})['title']               # severity
            vul_title = li.find_all('td')[3].text.strip()                                # vulnerability name
            chinese_data = li.find('span', attrs={'class': 'chinese_True'})['title']     # Chinese data available
            explit_data = li.find('span', attrs={'class': 'exploit_False'})['title']     # exploit code available
            detail_data = li.find('span', attrs={'class': 'details_True'})['title']      # vulnerability details
            software_data = li.find('span', attrs={'class': 'software_False'})['title']  # affected software
            analyzer_data = li.find('span', attrs={'class': 'analyzer_False'})['title']  # detection script
            populator = li.find_all('td')[5].text.strip()                                # popularity

            count += 1
            value = (resource, vul_id, submit_time, vul_level, vul_title, chinese_data,
                     explit_data, detail_data, software_data, analyzer_data, populator)
            print(value)
            data_insert(value, count)  # persist one record (see the sketch below)
        except Exception:
            continue


def main(deep):
    url_list = []
    count = 0
    for i in range(0, deep):
        url_list.append('http://vulhub.org.cn/vulns/{}?view=global'.format(i + 1))

    for url in url_list:
        count += 1
        print("Crawling page {}/{}: {}".format(count, len(url_list), url))
        get_content(url)
        time.sleep(10)
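All three scripts in this post call data_insert(value, count), which the post never shows. A minimal sketch of what it might look like, assuming a local SQLite file (the file name, table name, and one-TEXT-column-per-field schema are invented here for illustration):

import sqlite3


def data_insert(value, count):
    """Persist one scraped tuple; schema is hypothetical, sized to the tuple."""
    conn = sqlite3.connect('vulns.db')  # assumed database file
    cur = conn.cursor()
    # one TEXT column per field of the scraped tuple
    cols = ', '.join('f{} TEXT'.format(i) for i in range(len(value)))
    cur.execute('CREATE TABLE IF NOT EXISTS vulns ({})'.format(cols))
    cur.execute('INSERT INTO vulns VALUES ({})'.format(', '.join('?' * len(value))),
                value)
    conn.commit()
    conn.close()
    print('inserted record {}'.format(count))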

2. Crawling with a Selenium-driven browser

For sites that do have anti-crawling mechanisms, you can crawl by driving a real browser instead of sending bare HTTP requests.

Example site: Exploit Database – Exploits for Penetration Testers, Researchers, and Ethical Hackers (https://www.exploit-db.com/)

import time
from selenium import webdriver
from bs4 import BeautifulSoup
import requests

caps = {
    'browserName': 'chrome',
    'loggingPrefs': {
        'browser': 'ALL',
        'driver': 'ALL',
        'performance': 'ALL',
    },
    'goog:chromeOptions': {
        'perfLoggingPrefs': {
            'enableNetwork': True,
        },
        'w3c': False,
    },
}
caps['goog:loggingPrefs'] = {'performance': 'ALL'}

# driver = webdriver.Chrome(desired_capabilities=caps)

# TODO: fill in your own chromedriver path
driver = webdriver.Chrome(executable_path='/chromedriver.exe')
driver.maximize_window()
driver.get("https://www.exploit-db.com/")
driver.implicitly_wait(4)
time.sleep(3)
# tick the "Verified" filter checkbox
driver.find_element_by_xpath('//*[@id="app"]/div/div/div[2]/div[1]/div[1]/div/div[1]/div[1]/label').click()
time.sleep(3)
next = driver.find_element_by_xpath('//*[@id="exploits-table_next"]/a')  # "next page" link
next_button = driver.find_element_by_xpath('//*[@id="exploits-table_next"]')


def get_detail(title_link):
    """Open title_link and pull the EDB-ID and CVE from the detail page."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
            'cookie': '',  # TODO: fill in your own cookie (value omitted in the original post)
        }
        r = requests.get(title_link, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        edb_id = soup.find_all('h6', attrs={'class': 'stats-title'})[0].text.strip()
        try:
            cve = soup.find_all('h6', attrs={'class': 'stats-title'})[1].find('a').text.strip()
        except Exception:
            cve = 'N/A'
        return edb_id, cve
    except Exception:
        return "error1", "error2"


def get_content():
    """Parse the exploit table on the page currently loaded in the browser."""
    soup = BeautifulSoup(driver.page_source, 'lxml')
    liTags_odd = soup.find_all('tr', attrs={'class': 'odd'})
    liTags_even = soup.find_all('tr', attrs={'class': 'even'})
    liTags = liTags_even + liTags_odd
    count = 0
    for li in liTags:
        try:
            datetime = li.find_all('td')[0].text.strip()  # publication date
            exploit_download_link = 'https://www.exploit-db.com' + li.find('a')['href']  # download link
            try:
                Vulnerable_Application_link = 'https://www.exploit-db.com' + li.find_all('td')[2].find('a')['href']
            except Exception:
                Vulnerable_Application_link = "None"
            title = li.find_all('td')[4].text.strip()
            cate = li.find_all('td')[5].text.strip()  # type
            plateform = li.find_all('td')[6].text.strip()
            author = li.find_all('td')[7].text.strip()
            title_link = 'https://www.exploit-db.com' + li.find_all('td')[4].find('a')['href']
            edb_id, cve = get_detail(title_link)  # follow title_link for the detail fields

            count += 1
            value = (datetime, exploit_download_link, Vulnerable_Application_link, title, cate,
                     plateform, author, title_link, edb_id, cve)
            print(value)
            data_insert(value, count)
        except Exception:
            continue


def main():
    count = 0
    # locate the "next page" button
    next_button = driver.find_element_by_xpath('//*[@id="exploits-table_next"]')
    while True:
        count += 1
        try:
            if 'disabled' not in next_button.get_attribute('class'):
                time.sleep(3)
                print("Crawling page {}".format(count))
                next = driver.find_element_by_xpath('//*[@id="exploits-table_next"]/a')
                next.click()
                get_content()
            else:
                break  # "next" is disabled: last page reached
        except Exception:
            # the button reference goes stale after the DOM refreshes; re-find it and retry
            next_button = driver.find_element_by_xpath('//*[@id="exploits-table_next"]')
            if 'disabled' not in next_button.get_attribute('class'):
                time.sleep(3)
                print("Crawling page {}".format(count))
                next = driver.find_element_by_xpath('//*[@id="exploits-table_next"]/a')
                next.click()
                get_content()
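A side note on main(): the fixed time.sleep(3) pauses and the duplicated retry branch can be replaced with Selenium's explicit waits. A minimal sketch of the same pagination loop rewritten with WebDriverWait (same XPaths and the get_content() above; this is an alternative, not what the original post uses):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def crawl_all_pages():
    """Parse the current page, then click "next" until it becomes disabled."""
    wait = WebDriverWait(driver, 10)
    count = 0
    while True:
        count += 1
        print("Crawling page {}".format(count))
        get_content()  # parse the table currently on screen
        button = driver.find_element(By.XPATH, '//*[@id="exploits-table_next"]')
        if 'disabled' in button.get_attribute('class'):
            break  # last page reached
        try:
            # wait until the "next" link is clickable instead of sleeping a fixed 3 s
            wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="exploits-table_next"]/a'))).click()
        except TimeoutException:
            break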

3. For sites that detect even a simulated browser: attach to a manually opened browser

Take https://www.seebug.org as an example: Knownsec already fingerprints Selenium's automation traits, so the site cannot be crawled by launching Chrome through chromedriver directly. Instead, start the browser by hand so that opening the page is not flagged as automated.

Steps:

(1) Start Chrome manually: in Chrome's install directory, run from CMD:
chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\selenum\AutomationProfile"

(2) A Chrome window opens that the site treats as human-operated. In it, open the target page: https://www.seebug.org/vuldb/vulnerabilities

(3) Set your own chromedriver path in the code below and run the script.

Note: the script pages forward inside the tab you opened, so switch back to page 1 before starting; if the run breaks off partway, you can resume from whatever page the browser is left on.
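Step (1) can also be scripted rather than typed into CMD each time. A minimal sketch, assuming Chrome sits at its default Windows install path (both paths are assumptions to adjust for your machine):

import subprocess

# assumed default install location; adjust to your own machine
chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
subprocess.Popen([
    chrome_path,
    '--remote-debugging-port=9222',                   # expose the DevTools port for Selenium to attach to
    r'--user-data-dir=C:\selenum\AutomationProfile',  # separate profile dir, created if missing
])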

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
# attach to the Chrome instance started by hand on port 9222
options.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
# TODO: fill in your own chromedriver path
chrome_driver = "chromedriver.exe"
driver = webdriver.Chrome(chrome_driver, chrome_options=options)


def get_content():
    """Parse the vulnerability table on the page currently shown in the browser."""
    table_loc = (By.XPATH, '/html/body/div[2]/div/div/div/div/table')
    liTags = driver.find_element(*table_loc).find_elements(By.TAG_NAME, 'tr')
    count = 0
    for li in liTags:
        try:
            resource = 'https://www.seebug.org/'  # source site
            cve_id = li.find_element_by_class_name('fa-id-card').get_attribute('data-original-title')
            ssv_id = li.find_element_by_tag_name('a').text
            submit_time = li.find_element_by_class_name('datetime').text
            vul_level = li.find_element_by_class_name('vul-level').get_attribute('data-original-title')  # severity
            vul_title = li.find_element_by_class_name('vul-title').text  # vulnerability name
            wea_poc = li.find_element_by_class_name('fa-rocket').get_attribute('data-original-title')  # PoC available
            wea_range = li.find_element_by_class_name('fa-bullseye').get_attribute('data-original-title')  # test range available
            wea_detail = li.find_element_by_class_name('fa-file-text-o').get_attribute('data-original-title')  # details available
            wea_icon = li.find_element_by_class_name('fa-signal').get_attribute('data-original-title')  # chart available
            wea_exp = 'no exp'

            count += 1
            value = (resource, cve_id, ssv_id, submit_time, vul_level, vul_title,
                     wea_poc, wea_range, wea_detail, wea_icon, wea_exp)
            print(value)
            data_insert(value, count)
        except Exception:
            continue


def main():
    count = 0
    while True:
        try:
            arrow = driver.find_element_by_class_name('fa-chevron-right')  # ">" next-page arrow
        except Exception:
            break  # arrow gone: last page reached
        time.sleep(1)
        count += 1
        print("Crawling page {}: {}".format(count, driver.current_url))
        arrow.click()
        get_content()
        time.sleep(1)


