Tips:
1. Works for both http and https pages.
2. Run the following command from the top-level tutorial directory to see the result: scrapy crawl zdaye
3. scrapy is not part of the Python standard library; install it first: pip install scrapy
Scrapy project directory
Run scrapy startproject tutorial to create a new Scrapy project. This generates a tutorial directory with the following contents:
tutorial/
    scrapy.cfg            # project configuration file
    tutorial/             # the project's Python module; your code goes here
        __init__.py
        items.py          # project item definitions
        pipelines.py      # project pipelines
        settings.py       # project settings
        spiders/          # directory holding the spider code
            __init__.py
            ...
zdaye_spider.py
Write the spider: create a zdaye_spider.py file under the tutorial/spiders/ directory.
import scrapy


class KdlSpider(scrapy.spiders.Spider):
    name = "zdaye"  # spider name used by the "scrapy crawl" command

    def start_requests(self):
        url = "https://example.com"
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        print(response.status)
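For reference only, parse() can use Scrapy's built-in selectors to extract data instead of just printing the status code. The snippet below is a sketch, not part of the tutorial code; the CSS selector and the title field are assumptions about the target page:

    def parse(self, response):
        print(response.status)
        # Hypothetical extraction: grab the page title via a CSS selector
        title = response.css("title::text").get()
        yield {"title": title}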
myextend.py
Add a custom extension: create a myextend.py file under the tutorial/ directory. To adapt it, change api_url and set the IP extraction interval in the time.sleep call.
import time
import threading

import requests
from scrapy import signals

# API used to fetch proxy IPs
api_url = 'http://www.***.com/ShortProxy/GetIP/?api=1234567890&akey=8a17ca305f683620&count=10&timespan=3&type=3'

foo = True  # loop flag for the background proxy-refresh thread


class Proxy:
    def __init__(self):
        self._proxy_list = requests.get(api_url).json().get('data').get('proxy_list')

    @property
    def proxy_list(self):
        return self._proxy_list

    @proxy_list.setter
    def proxy_list(self, value):
        self._proxy_list = value


pro = Proxy()
print(pro.proxy_list)


class MyExtend:
    def __init__(self, crawler):
        self.crawler = crawler
        # Connect the custom methods to Scrapy signals so the extension
        # starts and stops together with the spider engine.
        # Scrapy signals docs: https://www.osgeo.cn/scrapy/topics/signals.html
        # Scrapy custom extensions docs: https://www.osgeo.cn/scrapy/topics/extensions.html
        crawler.signals.connect(self.start, signals.engine_started)
        crawler.signals.connect(self.close, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start(self):
        t = threading.Thread(target=self.extract_proxy)
        t.start()

    def extract_proxy(self):
        while foo:
            pro.proxy_list = requests.get(api_url).json().get('data').get('proxy_list')
            # Fetch a fresh batch of IPs every 15 seconds
            time.sleep(15)

    def close(self):
        global foo
        foo = False
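Note that Proxy and extract_proxy assume the extraction API returns JSON containing data.proxy_list, since that is what the .get('data').get('proxy_list') chain reads. The shape sketched below is an assumption for illustration; check your provider's actual response format and adjust the chain if it differs:

# Illustrative only: the response shape extract_proxy() expects from api_url
sample_response = {
    "data": {
        "proxy_list": ["113.0.0.1:8080", "27.0.0.2:8888"]  # hypothetical "IP:port" strings
    }
}
print(sample_response["data"]["proxy_list"])  # this is what pro.proxy_list will hold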
middlewares.py
Add ProxyDownloaderMiddleware, the proxy middleware, to middlewares.py;
replace the placeholders in the code: username is your account name, password is your password.
from scrapy import signals

from .myextend import pro
import random

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TutorialSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TutorialDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ProxyDownloaderMiddleware:
    def process_request(self, request, spider):
        proxy = random.choice(pro.proxy_list)

        # Username/password authentication
        username = "username"
        password = "password"
        request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy}

        # Whitelisted terminal IP authentication
        # request.meta['proxy'] = "http://%(proxy)s/" % {"proxy": proxy}
        return None
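ProxyDownloaderMiddleware assumes every entry in pro.proxy_list is a plain "IP:port" string, because it is interpolated directly into the proxy URL. As a rough sanity check (the address and credentials below are placeholders, not real values), the value written to request.meta['proxy'] looks like this:

proxy = "113.0.0.1:8080"  # hypothetical entry from pro.proxy_list
username, password = "username", "password"
print("http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy})
# -> http://username:password@113.0.0.1:8080/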
settings.py
Enable the ProxyDownloaderMiddleware proxy middleware and the custom extension in settings.py.
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
'tutorial.middlewares.ProxyDownloaderMiddleware': 100,
}
LOG_LEVEL = 'WARNING'
EXTENSIONS = {
'tutorial.myextend.MyExtend': 300,
}
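To double-check that traffic really goes out through the fetched proxies, one option is a small throwaway spider pointed at an IP-echo service. The sketch below is only a suggestion and assumes the external service https://httpbin.org/ip, which returns the IP a request originated from; it is not part of the tutorial project:

import scrapy


class ProxyCheckSpider(scrapy.Spider):
    # Hypothetical helper spider; run with: scrapy crawl proxycheck
    name = "proxycheck"

    def start_requests(self):
        yield scrapy.Request("https://httpbin.org/ip", callback=self.parse)

    def parse(self, response):
        # The echoed address should be one of the proxy IPs, not your own
        print(response.status, response.text)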