How do I emit flask_socketio events during a scrapy crawling process?

Since crawling takes time, I'm trying to build a progress bar on the front end by tracking the number of items my spider has crawled. The running count is compared against a cap (200 items here, so each percentage point corresponds to two items); once the cap is reached, a CloseSpider exception is raised.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from ..items import LinkItem

class MySpider(CrawlSpider):
    name = "crawl"

    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def __init__(self, allowed_domains=None, start_urls=None, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.allowed_domains = [allowed_domains]
        self.start_urls = [start_urls]
        self.item_count = 0
        self.max_item_count = 200
        # Items per percentage point; integer division avoids a float step
        # (200 / 100 would give 2.0), and max(1, ...) guards small caps.
        self.loading_unit = max(1, self.max_item_count // 100)
        self.loading_progress = 0

    def update_loading(self):
        # Recompute the percentage every `loading_unit` items.
        if self.item_count % self.loading_unit == 0:
            self.loading_progress = int(self.item_count / self.max_item_count * 100)
            # Hold at 99% so the bar only reaches 100% once the spider has closed.
            if self.loading_progress == 100:
                self.loading_progress = 99

    def parse_item(self, response):
        if self.item_count >= self.max_item_count:
            raise CloseSpider("Reached maximum item count")

        # Skip non-HTML responses (PDFs, images, etc.).
        content_type = response.headers.get('Content-Type', b'').decode('utf-8')
        if not content_type.startswith('text/html'):
            return

        if "english" in response.url:
            self.item_count += 1
            self.update_loading()

            link_item = LinkItem()
            link_item["url"] = response.url
            link_item["keyword"] = "english"

            self.logger.info(f"{self.item_count}. Found url: {response.url} with keyword english")

            yield link_item
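
For reference, I normally launch the spider on its own from the Scrapy project like this (the domain and start URL are placeholders):

scrapy crawl crawl -a allowed_domains=example.com -a start_urls=https://example.com/english/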

The problem is that I haven't been able to get this loading progress to the client side.

I've tried calling an emit function from the main app.

The spider modification:

    def update_loading(self):
        # Imported inside the function to avoid a circular import
        # (app.py imports this spider module at startup).
        from app import emit_loading_progress

        if self.item_count % self.loading_unit == 0:
            self.loading_progress = int(self.item_count / self.max_item_count * 100)
            if self.loading_progress == 100:
                self.loading_progress = 99
            emit_loading_progress(self.loading_progress)

And the app.py:

from flask import Flask, render_template
from flask_socketio import SocketIO
import scrapy.crawler as crawler
from crawl.crawl.spiders.crawl import MySpider
from dotenv import load_dotenv
import os
from forms import YourForm
from multiprocessing import Process, Queue
from twisted.internet import reactor
import traceback
from scrapy.utils.project import get_project_settings
import pandas as pd

load_dotenv()

app = Flask(__name__)
app.config['SECRET_KEY'] = os.getenv('SECRET_KEY')

socketio = SocketIO(app)

def f(q, allowed_domains, start_urls):
    # Runs inside a child process: the blocking Twisted reactor would
    # otherwise freeze the Flask server.
    try:
        settings = get_project_settings()
        runner = crawler.CrawlerRunner(settings)
        deferred = runner.crawl(MySpider, allowed_domains=allowed_domains, start_urls=start_urls)
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)

def run_spider(allowed_domains, start_urls):
    q = Queue()
    p = Process(target=f, args=(q, allowed_domains, start_urls))
    p.start()
    result = q.get()  # blocks until the crawl has finished
    p.join()

    if result is not None:
        raise result
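
One detail that may matter: everything inside f(), including the emit_loading_progress call triggered from the spider, runs in the child process, not in the process that owns the running SocketIO server. My guess is that importing app inside the child creates a second, unconnected SocketIO instance, so its emits never reach the browser. A minimal standalone demonstration of the process split (plain multiprocessing, no Scrapy involved):

import os
from multiprocessing import Process

def child():
    # Runs in the spawned process, just like f() above.
    print(f"child PID:  {os.getpid()}")

if __name__ == '__main__':
    print(f"parent PID: {os.getpid()}")
    p = Process(target=child)
    p.start()
    p.join()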

@socketio.on('submit')
def handle_submit(allowed_domains, start_urls):
    # Blocks this handler until the crawl completes, then sends the results.
    run_spider(allowed_domains, start_urls)
    emit_result()

def emit_result():
    # Load the items the crawl wrote to output/links.jsonl and push them
    # to the client.
    file_path = os.path.join(os.path.dirname(__file__), 'output', 'links.jsonl')

    if os.path.exists(file_path):
        data = pd.read_json(path_or_buf=file_path, lines=True)
        json_data = data.to_dict(orient='records')
        socketio.emit('spider_closed', json_data)
        print("SPIDER CLOSED")

def emit_loading_progress(progress):
    socketio.emit('update_loading', progress)
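
To narrow things down, something like this throwaway handler (the ping_progress event name is made up for testing) should show whether an emit issued from the main server process reaches the browser at all:

@socketio.on('ping_progress')
def handle_ping_progress():
    # Emits a dummy progress value directly from the Flask/SocketIO process,
    # bypassing the spider and the child process entirely.
    socketio.emit('update_loading', 42)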

@app.route('/')
def index():
    form = YourForm()
    return render_template('index.html', form=form)

@socketio.on_error()
def handle_error(e):
    print("An error occurred:", e)
    print(traceback.format_exc())

if __name__ == '__main__':
    socketio.run(app, debug=True, log_output=True)

I expected the client to receive the update_loading event, but it never does.

What can I do to track the spider's progress and send real-time updates to the client?
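
To rule out my front-end code, a minimal standalone listener should also work (a sketch using the python-socketio client package; the URL assumes Flask's default host and port):

import socketio

sio = socketio.Client()

# Listen for the same event the browser is supposed to receive.
@sio.on('update_loading')
def on_update_loading(progress):
    print(f"loading progress: {progress}%")

sio.connect('http://localhost:5000')
sio.wait()  # keep the client alive so events can arrive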
