models.py
import importlib

from django.db import models


class Project(models.Model):
    name = models.CharField(max_length=20, primary_key=True)

    def download_documents(self):
        """Run the Scrapy spider of every pipeline whose source has a URL."""
        # Scrapy is only needed for URL-backed sources, so import it lazily.
        from scrapy import crawler
        from scrapy.utils import project
        process = crawler.CrawlerProcess(project.get_project_settings())
        for pipeline in self.pipelines.all():
            if pipeline.source.url:
                process.crawl(pipeline.spider, pipeline_name=pipeline.name)
        process.start()

    def load_documents(self):
        """Run the loader of every pipeline whose source has a local path."""
        for pipeline in self.pipelines.all():
            if pipeline.source.path:
                pipeline.load_documents()

    def extract_documents(self):
        for pipeline in self.pipelines.all():
            pipeline.extract_documents()

    def annotate_documents(self):
        for pipeline in self.pipelines.all():
            pipeline.annotate_documents()

    def write_documents(self):
        for pipeline in self.pipelines.all():
            pipeline.write_documents()

    class Meta:
        db_table = 'project'

    def __str__(self):
        return self.name
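
    # --- Hedged usage sketch (not part of the original file) ---
    # One way a Django shell session or management command might drive a
    # project end to end; the project name 'demo' is purely illustrative.
    #
    #   project = Project.objects.get(name='demo')
    #   project.download_documents()   # Scrapy spiders for URL-backed sources
    #   project.load_documents()       # loaders for path-backed sources
    #   project.extract_documents()
    #   project.annotate_documents()
    #   project.write_documents()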


class Source(models.Model):
    name = models.CharField(max_length=32, primary_key=True)
    url = models.URLField(blank=True)
    path = models.TextField(blank=True, default='')

    class Meta:
        db_table = 'source'

    def __str__(self):
        return self.name
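
    # --- Hedged example (not in the original file) ---
    # A Source is either remote (url set, consumed by download_documents) or
    # local (path set, consumed by load_documents). The values below are
    # invented for illustration:
    #
    #   Source.objects.create(name='gazette-web', url='https://example.org/gazette')
    #   Source.objects.create(name='gazette-dump', path='/data/gazette/')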


class Pipeline(models.Model):
    name = models.CharField(max_length=32, primary_key=True)
    project = models.ForeignKey(Project, related_name='pipelines', on_delete=models.PROTECT)
    source = models.ForeignKey(Source, related_name='pipelines', on_delete=models.PROTECT)
    loader = models.CharField(max_length=20, blank=True)
    spider = models.CharField(max_length=20, blank=True)
    extractor = models.CharField(max_length=20, blank=True)
    annotation = models.CharField(max_length=100, blank=True)
    writer = models.CharField(max_length=20, blank=True)

    def load_documents(self):
        """Import the configured loader module and let it create documents."""
        if self.loader:
            loader = importlib.import_module('loaders.%s' % self.loader)
            loader.load_documents(self)

    def extract_documents(self):
        if self.extractor:
            extractor = importlib.import_module('extractors.%s' % self.extractor)
            extractor.extract(self.documents.all())

    def extract(self, document):
        if self.extractor:
            extractor = importlib.import_module('extractors.%s' % self.extractor)
            extractor.extract([document])

    def annotate_documents(self):
        # The annotation field is optional, so guard the dynamic import.
        if self.annotation:
            annotation_pipe = importlib.import_module('annotation.%s' % self.annotation)
            annotation_pipe.annotate(self.documents.all())

    def annotate(self, document):
        if self.annotation:
            annotation_pipe = importlib.import_module('annotation.%s' % self.annotation)
            annotation_pipe.annotate([document])

    def write_documents(self):
        for doc in self.documents.all():
            self.write(doc)

    def write(self, document):
        # The writer field is optional, so guard the dynamic import.
        if self.writer:
            writer = importlib.import_module('writers.%s' % self.writer)
            writer.write(document)

    class Meta:
        db_table = 'pipeline'

    def __str__(self):
        return self.name
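
# --- Hedged interface sketch (not part of the original file) ---
# Pipeline resolves each stage by dotted name with importlib, so every stage
# lives in its own module. The function signatures below are inferred from the
# calls above; the module names and docstrings are assumptions for illustration.
#
#   # loaders/filesystem.py (hypothetical)
#   def load_documents(pipeline):
#       """Read raw files from pipeline.source.path and create document rows."""
#
#   # extractors/plaintext.py (hypothetical)
#   def extract(documents):
#       """Populate extracted content on each document."""
#
#   # annotation/ner.py (hypothetical)
#   def annotate(documents):
#       """Attach annotations to each document."""
#
#   # writers/jsonlines.py (hypothetical)
#   def write(document):
#       """Serialize a single document to its output target."""
#
# Note: self.documents in Pipeline implies a Document model with a ForeignKey
# to Pipeline (related_name='documents') defined elsewhere; it is not shown in
# this file.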