变更

2025-08-05 09:19:34 +08:00
commit 584548d006
1696 changed files with 53855 additions and 0 deletions
@@ -0,0 +1,5 @@
+#@IgnoreInspection BashAddShebang
+# The example project is Python 3 code (it uses zero-argument super()), so it
+# cannot run on a Python 2.7 image; the "-onbuild" image variants are also
+# deprecated. The steps below spell out what the onbuild image did implicitly.
+FROM python:3
+
+WORKDIR /usr/src/app
+
+# Install the dependencies first so this layer is reused when only code changes.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+ENTRYPOINT ["scrapy"]
+CMD ["crawl", "dmoz"]
@@ -0,0 +1,154 @@
+============================
+Scrapy Redis Example Project
+============================
+
+
+This directory contains an example Scrapy project integrated with scrapy-redis.
+By default, all items are sent to redis (key ``<spider>:items``). All spiders
+schedule requests through redis, so you can start additional spiders to speed
+up the crawling.
+
+Spiders
+-------
+
+* **dmoz**
+
+  This spider simply scrapes dmoz.org.
+
+* **myspider_redis**
+
+  This spider uses redis as a shared requests queue and uses
+  ``myspider:start_urls`` as start URLs seed. For each URL, the spider outputs
+  one item.
+
+* **mycrawler_redis**
+
+  This spider uses redis as a shared requests queue and uses
+  ``mycrawler:start_urls`` as start URLs seed. For each URL, the spider follows
+  all links.
+
+
+.. note::
+
+    All requests are persisted by default. You can clear the queue by using the
+    ``SCHEDULER_FLUSH_ON_START`` setting. For example: ``scrapy crawl dmoz -s
+    SCHEDULER_FLUSH_ON_START=1``.
+
+
+Running the example project
+---------------------------
+
+This example illustrates how to share a spider's requests queue
+across multiple spider instances, highly suitable for broad crawls.
+
+1. Check that the scrapy_redis package is in your ``PYTHONPATH``
+
+2. Run the crawler for the first time, then stop it
+
+.. code-block:: bash
+
+    cd example-project
+    scrapy crawl dmoz
+    ... [dmoz] ...
+    ^C
+
+3. Run the crawler again to resume stopped crawling
+
+.. code-block:: bash
+
+    scrapy crawl dmoz
+    ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)
+
+4. Start one or more additional scrapy crawlers
+
+.. code-block:: bash
+
+    scrapy crawl dmoz
+    ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)
+
+5. Start one or more post-processing workers
+
+.. code-block:: bash
+
+    python process_items.py dmoz:items -v
+    ...
+    Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
+    Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
+    ...
+
+
+Feeding a Spider from Redis
+---------------------------
+
+The class ``scrapy_redis.spiders.RedisSpider`` enables a spider to read the
+urls from redis. The urls in the redis queue will be processed one
+after another; if the first request yields more requests, the spider
+will process those requests before fetching another url from redis.
+
+For example, create a file ``myspider.py`` with the code below:
+
+.. code-block:: python
+
+    from scrapy_redis.spiders import RedisSpider
+
+
+    class MySpider(RedisSpider):
+        name = "myspider"
+
+        def parse(self, response):
+            # do stuff
+            pass
+
+
+Then:
+
+1. run the spider
+
+.. code-block:: bash
+
+    scrapy runspider myspider.py
+
+2. push json data to redis
+
+.. code-block:: bash
+
+    redis-cli lpush myspider '{"url": "https://example.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }'
+
+
+.. note::
+
+    * These spiders rely on the spider idle signal to fetch start urls, hence
+      there may be a few seconds of delay between the time you push a new url
+      and the time the spider starts crawling it.
+
+    * Also please pay attention to json formatting.
+
+
+Processing items
+----------------
+
+The ``process_items.py`` script provides an example of consuming the items queue:
+
+.. code-block:: bash
+
+    python process_items.py --help
+
+
+Run via Docker
+--------------
+
+You require the following applications:
+
+* docker (https://docs.docker.com/installation/)
+* docker-compose (https://docs.docker.com/compose/install/)
+
+For implementation details see `Dockerfile` and `docker-compose.yml` and read
+official docker documentation.
+
+1. To start sample `example-project` (`-d` for daemon)::
+
+    docker-compose up
+
+2. To scale `crawler` (4 instances for example)::
+
+    docker-compose scale crawler=4
@@ -0,0 +1,9 @@
+redis:
+  image: redis
+  ports:
+   - "6379:6379" # added port for external db provisioning
+
+crawler:
+  build: .
+  links:
+    - redis:localhost
@@ -0,0 +1,24 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Field, Item
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import Join, MapCompose, TakeFirst
+
+
+class ExampleItem(Item):
+    """Item declaring the fields used by the example project."""
+
+    name = Field()
+    description = Field()
+    link = Field()
+    # ``crawled`` and ``spider`` are filled in by ``ExamplePipeline``.
+    crawled = Field()
+    spider = Field()
+    url = Field()
+
+
+class ExampleLoader(ItemLoader):
+    """Item loader that builds ``ExampleItem`` objects."""
+
+    default_item_class = ExampleItem
+    # Input values have surrounding whitespace stripped.
+    default_input_processor = MapCompose(lambda s: s.strip())
+    # Default output processor for every field without its own ``*_out``.
+    default_output_processor = TakeFirst()
+    # ``description`` overrides the default output processor with ``Join``.
+    description_out = Join()
@@ -0,0 +1,12 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/topics/item-pipeline.html
+from datetime import datetime
+
+
+class ExamplePipeline:
+    def process_item(self, item, spider):
+        item["crawled"] = datetime.utcnow()
+        item["spider"] = spider.name
+        return item
@@ -0,0 +1,37 @@
+# Scrapy settings for example project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+SPIDER_MODULES = ["example.spiders"]
+NEWSPIDER_MODULE = "example.spiders"
+
+LOG_LEVEL = "WARNING"
+
+USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)"
+
+#设置重复过滤器模块
+DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
+#设置调度器，scrapy_redis具备与数据库交互的功能
+SCHEDULER = "scrapy_redis.scheduler.Scheduler"
+#设置当爬虫结束时是否保持redis数据库中的去重集合与任务队列
+SCHEDULER_PERSIST = True
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
+# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
+
+ITEM_PIPELINES = {
+    "example.pipelines.ExamplePipeline": 300,
+    #当开启该管道，该管道将会把数据存到redis数据库中
+    "scrapy_redis.pipelines.RedisPipeline": 400,
+}
+#设置redis数据库
+REDIS_URL = "redis://127.0.0.1:6379"
+
+LOG_LEVEL = "DEBUG"
+
+# Introduce an artifical delay to make use of parallelism. to speed up the
+# crawl.
+DOWNLOAD_DELAY = 1
@@ -0,0 +1,8 @@
+# This package will contain the spiders of your Scrapy project
+#
+# To create the first spider for your project use this command:
+#
+#   scrapy genspider myspider myspider-domain.com
+#
+# For more info see:
+# http://doc.scrapy.org/topics/spiders.html
@@ -0,0 +1,26 @@
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
+
+class DmozSpider(CrawlSpider):
+    """Follow categories and extract links."""
+
+    name = "dmoz"
+    allowed_domains = ["dmoztools.net"]
+    start_urls = ["http://www.dmoztools.net/"]
+
+    rules = [
+        Rule(
+            LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")),
+            callback="parse_directory",
+            follow=True,
+        ),
+    ]
+
+    def parse_directory(self, response):
+        for div in response.css(".title-and-desc"):
+            yield {
+                "name": div.css(".site-title::text").extract_first(),
+                "description": div.css(".site-descr::text").extract_first().strip(),
+                "link": div.css("a::attr(href)").extract_first(),
+            }
@@ -0,0 +1,28 @@
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import Rule
+
+from scrapy_redis.spiders import RedisCrawlSpider
+
+
+class MyCrawler(RedisCrawlSpider):
+    """Spider that reads urls from redis queue (myspider:start_urls)."""
+
+    name = "mycrawler_redis"
+    redis_key = "mycrawler:start_urls"
+
+    rules = (
+        # follow all links
+        Rule(LinkExtractor(), callback="parse_page", follow=True),
+    )
+
+    def __init__(self, *args, **kwargs):
+        # Dynamically define the allowed domains list.
+        domain = kwargs.pop("domain", "")
+        self.allowed_domains = filter(None, domain.split(","))
+        super().__init__(*args, **kwargs)
+
+    def parse_page(self, response):
+        return {
+            "name": response.css("title::text").extract_first(),
+            "url": response.url,
+        }
@@ -0,0 +1,20 @@
+from scrapy_redis.spiders import RedisSpider
+
+
+class MySpider(RedisSpider):
+    """Spider that reads urls from redis queue (myspider:start_urls)."""
+
+    name = "myspider_redis"
+    redis_key = "myspider:start_urls"
+
+    def __init__(self, *args, **kwargs):
+        # Dynamically define the allowed domains list.
+        domain = kwargs.pop("domain", "")
+        self.allowed_domains = filter(None, domain.split(","))
+        super().__init__(*args, **kwargs)
+
+    def parse(self, response):
+        return {
+            "name": response.css("title::text").extract_first(),
+            "url": response.url,
+        }
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+# -*- coding: utf-8 -*-
+"""A script to process items from a redis queue."""
+
+import argparse
+import json
+import logging
+import pprint
+import sys
+import time
+
+from scrapy_redis import get_redis
+
+logger = logging.getLogger("process_items")
+
+
+def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1):
+    """Process items from a redis queue.
+
+    Parameters
+    ----------
+    r : Redis
+        Redis connection instance.
+    keys : list
+        List of keys to read the items from.
+    timeout: int
+        Read timeout.
+
+    """
+    limit = limit or float("inf")
+    processed = 0
+    while processed < limit:
+        # Change ``blpop`` to ``brpop`` to process as LIFO.
+        ret = r.blpop(keys, timeout)
+        # If data is found before the timeout then we consider we are done.
+        if ret is None:
+            time.sleep(wait)
+            continue
+
+        source, data = ret
+        try:
+            item = json.loads(data)
+        except Exception:
+            logger.exception("Failed to load item:\n%r", pprint.pformat(data))
+            continue
+
+        try:
+            name = item.get("name") or item.get("title")
+            url = item.get("url") or item.get("link")
+            logger.debug("[%s] Processing item: %s <%s>", source, name, url)
+        except KeyError:
+            logger.exception(
+                "[%s] Failed to process item:\n%r", source, pprint.pformat(item)
+            )
+            continue
+
+        processed += 1
+        if processed % log_every == 0:
+            logger.info("Processed %s items", processed)
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("key", help="Redis key where items are stored")
+    parser.add_argument("--host")
+    parser.add_argument("--port")
+    parser.add_argument("--timeout", type=int, default=5)
+    parser.add_argument("--limit", type=int, default=0)
+    parser.add_argument("--progress-every", type=int, default=100)
+    parser.add_argument("-v", "--verbose", action="store_true")
+
+    args = parser.parse_args()
+
+    params = {}
+    if args.host:
+        params["host"] = args.host
+    if args.port:
+        params["port"] = args.port
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    r = get_redis(**params)
+    host = r.connection_pool.get_connection("info").host
+    logger.info("Waiting for items in '%s' (server: %s)", args.key, host)
+    kwargs = {
+        "keys": [args.key],
+        "timeout": args.timeout,
+        "limit": args.limit,
+        "log_every": args.progress_every,
+    }
+    try:
+        process_items(r, **kwargs)
+        retcode = 0  # ok
+    except KeyboardInterrupt:
+        retcode = 0  # ok
+    except Exception:
+        logger.exception("Unhandled exception")
+        retcode = 2
+
+    return retcode
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,2 @@
+scrapy
+scrapy-redis
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = example.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = example