变更
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
#@IgnoreInspection BashAddShebang
|
||||
FROM python:2.7-onbuild
|
||||
|
||||
ENTRYPOINT ["scrapy"]
|
||||
CMD ["crawl", "dmoz"]
|
||||
@@ -0,0 +1,154 @@
|
||||
============================
|
||||
Scrapy Redis Example Project
|
||||
============================
|
||||
|
||||
|
||||
This directory contains an example Scrapy project integrated with scrapy-redis.
|
||||
By default, all items are sent to redis (key ``<spider>:items``). All spiders
|
||||
schedule requests through redis, so you can start additional spiders to speed
|
||||
up the crawling.
|
||||
|
||||
Spiders
|
||||
-------
|
||||
|
||||
* **dmoz**
|
||||
|
||||
This spider simply scrapes dmoz.org.
|
||||
|
||||
* **myspider_redis**
|
||||
|
||||
This spider uses redis as a shared requests queue and uses
|
||||
``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs
|
||||
one item.
|
||||
|
||||
* **mycrawler_redis**
|
||||
|
||||
This spider uses redis as a shared requests queue and uses
|
||||
``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows
|
||||
all links.
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
All requests are persisted by default. You can clear the queue by using the
|
||||
``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s
|
||||
SCHEDULER_FLUSH_ON_START=1``.
|
||||
|
||||
|
||||
Running the example project
|
||||
---------------------------
|
||||
|
||||
This example illustrates how to share a spider's requests queue
|
||||
across multiple spider instances, highly suitable for broad crawls.
|
||||
|
||||
1. Make sure the scrapy_redis package is in your ``PYTHONPATH``
|
||||
|
||||
2. Run the crawler for the first time, then stop it
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd example-project
|
||||
scrapy crawl dmoz
|
||||
... [dmoz] ...
|
||||
^C
|
||||
|
||||
3. Run the crawler again to resume stopped crawling
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
scrapy crawl dmoz
|
||||
... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)
|
||||
|
||||
4. Start one or more additional scrapy crawlers
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
scrapy crawl dmoz
|
||||
... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)
|
||||
|
||||
5. Start one or more post-processing workers
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python process_items.py dmoz:items -v
|
||||
...
|
||||
Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
|
||||
Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
|
||||
...
|
||||
|
||||
|
||||
Feeding a Spider from Redis
|
||||
---------------------------
|
||||
|
||||
The class ``scrapy_redis.spiders.RedisSpider`` enables a spider to read the
|
||||
urls from redis. The urls in the redis queue will be processed one
|
||||
after another; if the first request yields more requests, the spider
|
||||
will process those requests before fetching another url from redis.
|
||||
|
||||
For example, create a file ``myspider.py`` with the code below:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
|
||||
name = "myspider"
|
||||
|
||||
def parse(self, response):
|
||||
# do stuff
|
||||
pass
|
||||
|
||||
|
||||
Then:
|
||||
|
||||
1. run the spider
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
scrapy runspider myspider.py
|
||||
|
||||
2. push json data to redis
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
redis-cli lpush myspider:start_urls '{"url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }'
|
||||
|
||||
|
||||
.. note::
|
||||
|
||||
* These spiders rely on the spider idle signal to fetch start urls, so there
|
||||
may be a few seconds of delay between the time you push a new url and the
|
||||
spider starts crawling it.
|
||||
|
||||
* Also please pay attention to json formatting.
|
||||
|
||||
|
||||
Processing items
|
||||
----------------
|
||||
|
||||
The ``process_items.py`` script provides an example of consuming the items queue:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python process_items.py --help
|
||||
|
||||
|
||||
Run via Docker
|
||||
--------------
|
||||
|
||||
You require the following applications:
|
||||
|
||||
* docker (https://docs.docker.com/installation/)
|
||||
* docker-compose (https://docs.docker.com/compose/install/)
|
||||
|
||||
For implementation details see `Dockerfile` and `docker-compose.yml` and read
|
||||
the official Docker documentation.
|
||||
|
||||
1. To start sample `example-project` (`-d` for daemon)::
|
||||
|
||||
docker-compose up
|
||||
|
||||
2. To scale `crawler` (4 instances for example)::
|
||||
|
||||
docker-compose scale crawler=4
|
||||
@@ -0,0 +1,9 @@
|
||||
redis:
|
||||
image: redis
|
||||
ports:
|
||||
- "6379:6379" # added port for external db provisioning
|
||||
|
||||
crawler:
|
||||
build: .
|
||||
links:
|
||||
- redis:localhost
|
||||
@@ -0,0 +1,24 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# http://doc.scrapy.org/topics/items.html
|
||||
|
||||
from scrapy.item import Field, Item
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.loader.processors import Join, MapCompose, TakeFirst
|
||||
|
||||
|
||||
class ExampleItem(Item):
    """Declare the fields an item of the example project may carry."""

    # Descriptive data about a scraped entry.
    name = Field()
    description = Field()
    link = Field()

    # Bookkeeping data: when the item was crawled and by which spider.
    crawled = Field()
    spider = Field()

    # Address of the page the item was taken from.
    url = Field()
|
||||
|
||||
|
||||
class ExampleLoader(ItemLoader):
    """Item loader that builds ``ExampleItem`` objects.

    Every collected value has its surrounding whitespace stripped on input.
    On output the first collected value is kept, except for ``description``,
    whose values are joined into a single string.
    """

    default_item_class = ExampleItem

    # Input side: strip each incoming value.
    default_input_processor = MapCompose(lambda value: value.strip())

    # Output side: first value wins unless a field overrides it below.
    default_output_processor = TakeFirst()

    # ``description`` override: join all collected values instead.
    description_out = Join()
|
||||
@@ -0,0 +1,12 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: http://doc.scrapy.org/topics/item-pipeline.html
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class ExamplePipeline:
    """Item pipeline that stamps every item with crawl metadata."""

    def process_item(self, item, spider):
        """Record when and by which spider ``item`` was scraped.

        Parameters
        ----------
        item : mapping
            The scraped item; it is modified in place.
        spider : object
            The spider that produced the item; only ``spider.name`` is read.

        Returns
        -------
        The same ``item``, with ``"crawled"`` set to the current UTC time
        (a naive ``datetime``) and ``"spider"`` set to the spider's name.
        """
        # Imported locally because the module only imports ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Take the
        # aware UTC time and drop the tzinfo so the stored value stays a
        # naive UTC datetime, exactly as before.
        item["crawled"] = datetime.now(timezone.utc).replace(tzinfo=None)
        item["spider"] = spider.name
        return item
|
||||
@@ -0,0 +1,37 @@
|
||||
# Scrapy settings for example project
|
||||
#
|
||||
# For simplicity, this file contains only the most important settings by
|
||||
# default. All the other settings are documented here:
|
||||
#
|
||||
# http://doc.scrapy.org/topics/settings.html
|
||||
#
|
||||
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"

# Set the duplicates filter module.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Set the scheduler; the scrapy_redis one is able to interact with the
# database.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to keep the dedup set and the task queue in the redis database
# when the spider finishes.
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    "example.pipelines.ExamplePipeline": 300,
    # When this pipeline is enabled, it stores the data in the redis database.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}
# Set the redis database.
REDIS_URL = "redis://127.0.0.1:6379"

# LOG_LEVEL used to be assigned twice ("WARNING" first, then "DEBUG"); only
# the last assignment ever took effect, so it is the one kept here.
LOG_LEVEL = "DEBUG"

# Introduce an artificial delay to make use of parallelism to speed up the
# crawl.
DOWNLOAD_DELAY = 1
|
||||
@@ -0,0 +1,8 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# To create the first spider for your project use this command:
|
||||
#
|
||||
# scrapy genspider myspider myspider-domain.com
|
||||
#
|
||||
# For more info see:
|
||||
# http://doc.scrapy.org/topics/spiders.html
|
||||
@@ -0,0 +1,26 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
|
||||
|
||||
class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    rules = [
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element found in ``response``.

        Each dict has the keys ``name``, ``description`` and ``link``.
        ``description`` is whitespace-stripped and is an empty string when
        the entry has no description text.
        """
        for entry in response.css(".title-and-desc"):
            # Default to "" so an entry without description text does not
            # raise AttributeError on ``None.strip()`` and abort the page.
            description = entry.css(".site-descr::text").extract_first(default="")
            yield {
                "name": entry.css(".site-title::text").extract_first(),
                "description": description.strip(),
                "link": entry.css("a::attr(href)").extract_first(),
            }
|
||||
@@ -0,0 +1,28 @@
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import Rule
|
||||
|
||||
from scrapy_redis.spiders import RedisCrawlSpider
|
||||
|
||||
|
||||
class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Create the spider, taking the allowed domains from ``domain``.

        ``domain`` is an optional keyword argument holding a comma-separated
        list of domains. It is removed from ``kwargs`` before the remaining
        arguments are passed to the base class.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a lazy object
        # that is truthy even when empty and can be consumed only once, so
        # emptiness checks and repeated reads of the attribute would misbehave.
        self.allowed_domains = [part for part in domain.split(",") if part]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Return a dict with the page title (``name``) and its ``url``."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
@@ -0,0 +1,20 @@
|
||||
from scrapy_redis.spiders import RedisSpider
|
||||
|
||||
|
||||
class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""

    name = "myspider_redis"
    redis_key = "myspider:start_urls"

    def __init__(self, *args, **kwargs):
        """Create the spider, taking the allowed domains from ``domain``.

        ``domain`` is an optional keyword argument holding a comma-separated
        list of domains. It is removed from ``kwargs`` before the remaining
        arguments are passed to the base class.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a lazy object
        # that is truthy even when empty and can be consumed only once, so
        # emptiness checks and repeated reads of the attribute would misbehave.
        self.allowed_domains = [part for part in domain.split(",") if part]
        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Return a dict with the page title (``name``) and its ``url``."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
|
||||
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
"""A script to process items from a redis queue."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import pprint
|
||||
import sys
|
||||
import time
|
||||
|
||||
from scrapy_redis import get_redis
|
||||
|
||||
logger = logging.getLogger("process_items")


def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1):
    """Process items from a redis queue.

    Parameters
    ----------
    r : Redis
        Redis connection instance.
    keys : list
        List of keys to read the items from.
    timeout: int
        Read timeout.
    limit : int, optional
        Stop once this many items were processed. ``0`` (the default) means
        no limit, i.e. loop forever.
    log_every : int, optional
        Emit a progress message every ``log_every`` processed items. ``0``
        disables progress messages.
    wait : float, optional
        Seconds to sleep after a read that returned nothing, before reading
        again.

    """
    limit = limit or float("inf")
    processed = 0
    while processed < limit:
        # Change ``blpop`` to ``brpop`` to process as LIFO.
        ret = r.blpop(keys, timeout)
        # Nothing arrived before the timeout: pause briefly and poll again.
        if ret is None:
            time.sleep(wait)
            continue

        source, data = ret
        try:
            item = json.loads(data)
        except Exception:
            logger.exception("Failed to load item:\n%r", pprint.pformat(data))
            continue

        try:
            name = item.get("name") or item.get("title")
            url = item.get("url") or item.get("link")
            logger.debug("[%s] Processing item: %s <%s>", source, name, url)
        except (AttributeError, KeyError):
            # AttributeError covers valid JSON that is not an object (list,
            # number, string, null): such payloads have no ``.get`` and used
            # to crash the whole worker instead of being skipped.
            logger.exception(
                "[%s] Failed to process item:\n%r", source, pprint.pformat(item)
            )
            continue

        processed += 1
        # ``log_every`` of 0 would otherwise raise ZeroDivisionError.
        if log_every and processed % log_every == 0:
            logger.info("Processed %s items", processed)
|
||||
|
||||
|
||||
def main():
    """Parse the command line and consume items from the given redis key.

    Returns
    -------
    int
        Exit code: ``0`` when processing finished or was interrupted with
        Ctrl-C, ``2`` when an unhandled exception was logged.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("key", help="Redis key where items are stored")
    parser.add_argument("--host")
    # Parse the port as an integer so bad input is rejected by argparse.
    parser.add_argument("--port", type=int)
    parser.add_argument("--timeout", type=int, default=5)
    parser.add_argument("--limit", type=int, default=0)
    parser.add_argument("--progress-every", type=int, default=100)
    parser.add_argument("-v", "--verbose", action="store_true")

    args = parser.parse_args()

    # Only pass what the user supplied; everything else is left to get_redis.
    params = {}
    if args.host:
        params["host"] = args.host
    if args.port:
        params["port"] = args.port

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    r = get_redis(**params)
    # Borrow a connection only to read the server host for the log line, and
    # hand it back to the pool afterwards instead of leaking it.
    pool = r.connection_pool
    connection = pool.get_connection("info")
    try:
        host = connection.host
    finally:
        pool.release(connection)
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
    kwargs = {
        "keys": [args.key],
        "timeout": args.timeout,
        "limit": args.limit,
        "log_every": args.progress_every,
    }
    try:
        process_items(r, **kwargs)
        retcode = 0  # ok
    except KeyboardInterrupt:
        retcode = 0  # ok
    except Exception:
        logger.exception("Unhandled exception")
        retcode = 2

    return retcode
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,2 @@
|
||||
scrapy
|
||||
scrapy-redis
|
||||
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# http://doc.scrapy.org/topics/scrapyd.html
|
||||
|
||||
[settings]
|
||||
default = example.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = example
|
||||
Reference in New Issue
Block a user