This commit is contained in:
2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,5 @@
#@IgnoreInspection BashAddShebang
# Base image: the Python 2.7 "onbuild" tag.
# NOTE(review): the spiders in this project call ``super().__init__()`` without
# arguments, which is Python 3 syntax -- confirm this base image is still intended.
FROM python:2.7-onbuild
# Run the ``scrapy`` command-line tool as the container entry point.
ENTRYPOINT ["scrapy"]
# Default arguments: crawl with the ``dmoz`` spider.
CMD ["crawl", "dmoz"]
@@ -0,0 +1,154 @@
============================
Scrapy Redis Example Project
============================
This directory contains an example Scrapy project integrated with scrapy-redis.
By default, all items are sent to redis (key ``<spider>:items``). All spiders
schedule requests through redis, so you can start additional spiders to speed
up the crawling.
Spiders
-------
* **dmoz**
This spider simply scrapes dmoz.org.
* **myspider_redis**
This spider uses redis as a shared requests queue and uses
``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs
one item.
* **mycrawler_redis**
This spider uses redis as a shared requests queue and uses
``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows
all links.
.. note::
All requests are persisted by default. You can clear the queue by using the
``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s
SCHEDULER_FLUSH_ON_START=1``.
Running the example project
---------------------------
This example illustrates how to share a spider's requests queue
across multiple spider instances, highly suitable for broad crawls.
1. Check that the scrapy_redis package is in your ``PYTHONPATH``
2. Run the crawler for the first time, then stop it
.. code-block:: bash
cd example-project
scrapy crawl dmoz
... [dmoz] ...
^C
3. Run the crawler again to resume stopped crawling
.. code-block:: bash
scrapy crawl dmoz
... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)
4. Start one or more additional scrapy crawlers
.. code-block:: bash
scrapy crawl dmoz
... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)
5. Start one or more post-processing workers
.. code-block:: bash
python process_items.py dmoz:items -v
...
Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
...
Feeding a Spider from Redis
---------------------------
The class ``scrapy_redis.spiders.RedisSpider`` enables a spider to read the
urls from redis. The urls in the redis queue will be processed one
after another; if the first request yields more requests, the spider
will process those requests before fetching another url from redis.
For example, create a file ``myspider.py`` with the code below:
.. code-block:: python
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
name = "myspider"
def parse(self, response):
# do stuff
pass
Then:
1. run the spider
.. code-block:: bash
scrapy runspider myspider.py
2. push json data to redis
.. code-block:: bash
redis-cli lpush myspider '{"url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }'
.. note::
* These spiders rely on the spider idle signal to fetch start urls, hence there
may be a delay of a few seconds between the time you push a new url and the
time the spider starts crawling it.
* Also please pay attention to json formatting.
Processing items
----------------
The ``process_items.py`` script provides an example of consuming the items queue:
.. code-block:: bash
python process_items.py --help
Run via Docker
--------------
You require the following applications:
* docker (https://docs.docker.com/installation/)
* docker-compose (https://docs.docker.com/compose/install/)
For implementation details see ``Dockerfile`` and ``docker-compose.yml`` and read
the official Docker documentation.
1. To start sample `example-project` (`-d` for daemon)::
docker-compose up
2. To scale `crawler` (4 instances for example)::
docker-compose scale crawler=4
@@ -0,0 +1,9 @@
# Redis service that the crawler containers link to.
redis:
  image: redis
  ports:
    - "6379:6379" # added port for external db provisioning
# Crawler service; the image is built from the current directory.
crawler:
  build: .
  links:
    # Link the redis service into the crawler under the alias ``localhost``.
    - redis:localhost
@@ -0,0 +1,24 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Field, Item
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst
class ExampleItem(Item):
    """Item declaring the fields used by the example project."""

    # Descriptive fields for a scraped entry.
    name = Field()
    description = Field()
    link = Field()
    # ``crawled`` and ``spider`` are assigned by ``ExamplePipeline.process_item``.
    crawled = Field()
    spider = Field()
    url = Field()
class ExampleLoader(ItemLoader):
    """Item loader that builds ``ExampleItem`` objects.

    Input values are passed through ``str.strip``-style stripping via
    ``MapCompose``; output values go through ``TakeFirst`` by default, while
    the ``description`` field uses ``Join`` instead.
    """

    default_item_class = ExampleItem

    # Output processors: ``description`` overrides the loader-wide default.
    description_out = Join()
    default_output_processor = TakeFirst()

    # Input processor applied to every field: strip each incoming value.
    default_input_processor = MapCompose(lambda value: value.strip())
@@ -0,0 +1,12 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime
class ExamplePipeline:
    """Item pipeline that stamps every item with crawl metadata."""

    def process_item(self, item, spider):
        """Record when, and by which spider, ``item`` was crawled.

        Parameters
        ----------
        item : mapping
            The scraped item; it is modified in place.
        spider : object
            The spider that produced the item; only ``spider.name`` is read.

        Returns
        -------
        mapping
            The same ``item`` object, with ``crawled`` and ``spider`` set.
        """
        # Function-scope import: the module itself only imports ``datetime``.
        from datetime import timezone

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Take an aware
        # UTC "now" and drop the tzinfo so the stored value stays a naive UTC
        # datetime, exactly as before.
        item["crawled"] = datetime.now(timezone.utc).replace(tzinfo=None)
        item["spider"] = spider.name
        return item
@@ -0,0 +1,37 @@
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ["example.spiders"]
NEWSPIDER_MODULE = "example.spiders"

USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"

# Duplicate-request filter class.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler class; scrapy_redis provides the ability to interact with the
# database.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to keep the dedup set and the request queue in the redis database
# when the spider finishes.
SCHEDULER_PERSIST = True
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    "example.pipelines.ExamplePipeline": 300,
    # When this pipeline is enabled, it stores the items in the redis database.
    "scrapy_redis.pipelines.RedisPipeline": 400,
}

# Redis database to connect to.
REDIS_URL = "redis://127.0.0.1:6379"

# Effective log level. Keep a single assignment in this file: a later
# ``LOG_LEVEL`` silently overrides any earlier one.
LOG_LEVEL = "DEBUG"

# Introduce an artificial delay to make use of parallelism to speed up the
# crawl.
DOWNLOAD_DELAY = 1
@@ -0,0 +1,8 @@
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
# scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html
@@ -0,0 +1,26 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class DmozSpider(CrawlSpider):
    """Follow categories and extract links."""

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = ["http://www.dmoztools.net/"]

    # Follow links found inside the category elements and hand every fetched
    # page to ``parse_directory``.
    rules = [
        Rule(
            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
            callback="parse_directory",
            follow=True,
        ),
    ]

    def parse_directory(self, response):
        """Yield one dict per ``.title-and-desc`` element in ``response``.

        Each dict has the keys ``name``, ``description`` and ``link``.
        ``name`` and ``link`` are ``None`` when the matching node is absent;
        ``description`` falls back to an empty string.
        """
        for div in response.css(".title-and-desc"):
            # ``default=""`` keeps ``.strip()`` from failing with
            # AttributeError when an entry has no description text.
            description = div.css(".site-descr::text").extract_first(default="")
            yield {
                "name": div.css(".site-title::text").extract_first(),
                "description": description.strip(),
                "link": div.css("a::attr(href)").extract_first(),
            }
@@ -0,0 +1,28 @@
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
class MyCrawler(RedisCrawlSpider):
    """Spider that reads urls from redis queue (mycrawler:start_urls)."""

    name = "mycrawler_redis"
    redis_key = "mycrawler:start_urls"

    rules = (
        # follow all links
        Rule(LinkExtractor(), callback="parse_page", follow=True),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider, reading allowed domains from ``domain``.

        ``domain`` is an optional keyword argument holding a comma-separated
        list of domains; it is removed from ``kwargs`` before the parent
        initialiser runs.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a one-shot
        # iterator that is always truthy (even when empty) and is exhausted
        # after the first pass over it.
        self.allowed_domains = [part for part in domain.split(",") if part]
        super().__init__(*args, **kwargs)

    def parse_page(self, response):
        """Return a dict with the page ``<title>`` text and the response url."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
@@ -0,0 +1,20 @@
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
    """Spider that reads urls from redis queue (myspider:start_urls)."""

    name = "myspider_redis"
    redis_key = "myspider:start_urls"

    def __init__(self, *args, **kwargs):
        """Initialise the spider, reading allowed domains from ``domain``.

        ``domain`` is an optional keyword argument holding a comma-separated
        list of domains; it is removed from ``kwargs`` before the parent
        initialiser runs.
        """
        # Dynamically define the allowed domains list.
        domain = kwargs.pop("domain", "")
        # Build a real list: on Python 3 ``filter()`` returns a one-shot
        # iterator that is always truthy (even when empty) and is exhausted
        # after the first pass over it.
        self.allowed_domains = [part for part in domain.split(",") if part]
        super().__init__(*args, **kwargs)

    def parse(self, response):
        """Return a dict with the page ``<title>`` text and the response url."""
        return {
            "name": response.css("title::text").extract_first(),
            "url": response.url,
        }
@@ -0,0 +1,105 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A script to process items from a redis queue."""
import argparse
import json
import logging
import pprint
import sys
import time
from scrapy_redis import get_redis
logger = logging.getLogger("process_items")


def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1):
    """Process items from a redis queue.

    Parameters
    ----------
    r : Redis
        Redis connection instance.
    keys : list
        List of keys to read the items from.
    timeout : int
        Read timeout passed to ``blpop``.
    limit : int, optional
        Stop after this many items have been processed. ``0`` (the default)
        means no limit.
    log_every : int, optional
        Log a progress message each time this many items have been processed.
        ``0`` disables progress messages.
    wait : float, optional
        Seconds to sleep before polling again when ``blpop`` returned nothing.

    """
    limit = limit or float("inf")
    processed = 0
    while processed < limit:
        # Change ``blpop`` to ``brpop`` to process as LIFO.
        ret = r.blpop(keys, timeout)
        # Nothing arrived before the timeout: pause briefly and poll again.
        if ret is None:
            time.sleep(wait)
            continue

        source, data = ret
        try:
            item = json.loads(data)
        except Exception:
            logger.exception("Failed to load item:\n%r", pprint.pformat(data))
            continue

        try:
            name = item.get("name") or item.get("title")
            url = item.get("url") or item.get("link")
            logger.debug("[%s] Processing item: %s <%s>", source, name, url)
        except (AttributeError, KeyError):
            # AttributeError covers valid JSON that is not an object (a list,
            # number, string, ...), which has no ``.get``; skip it rather than
            # aborting the whole loop.
            logger.exception(
                "[%s] Failed to process item:\n%r", source, pprint.pformat(item)
            )
            continue

        processed += 1
        # Guard against ``log_every == 0`` to avoid a ZeroDivisionError.
        if log_every and processed % log_every == 0:
            logger.info("Processed %s items", processed)
def main():
    """Parse the command line and run the item-processing loop.

    Returns
    -------
    int
        Exit code: ``0`` when processing finishes or is interrupted with
        Ctrl-C, ``2`` when an unhandled exception is logged.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("key", help="Redis key where items are stored")
    parser.add_argument("--host")
    # Parse the port as an integer so a malformed value is rejected by
    # argparse up front.
    parser.add_argument("--port", type=int)
    parser.add_argument("--timeout", type=int, default=5)
    parser.add_argument("--limit", type=int, default=0)
    parser.add_argument("--progress-every", type=int, default=100)
    parser.add_argument("-v", "--verbose", action="store_true")

    args = parser.parse_args()

    # Only forward the connection options that were given explicitly.
    params = {}
    if args.host:
        params["host"] = args.host
    if args.port:
        params["port"] = args.port

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    r = get_redis(**params)
    # Borrow a connection only to read the server host, then hand it back so
    # it does not stay checked out of the pool for the life of the process.
    pool = r.connection_pool
    conn = pool.get_connection("info")
    try:
        host = conn.host
    finally:
        pool.release(conn)
    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)

    try:
        process_items(
            r,
            keys=[args.key],
            timeout=args.timeout,
            limit=args.limit,
            log_every=args.progress_every,
        )
        retcode = 0  # ok
    except KeyboardInterrupt:
        retcode = 0  # ok
    except Exception:
        logger.exception("Unhandled exception")
        retcode = 2

    return retcode
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,2 @@
scrapy
scrapy-redis
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html
[settings]
default = example.settings
[deploy]
#url = http://localhost:6800/
project = example