models.py
import importlib

from django.db import models


class Project(models.Model):
    name = models.CharField(max_length=20, primary_key=True)

    def download_documents(self):
        # Imported lazily so Django can load this module without Scrapy
        # installed; `project` here is scrapy.utils.project, not this model.
        from scrapy import crawler
        from scrapy.utils import project

        process = crawler.CrawlerProcess(project.get_project_settings())
        for pipeline in self.pipelines.all():
            if pipeline.source.url:
                process.crawl(pipeline.spider, pipeline_name=pipeline.name)
        # start() blocks until all queued crawls finish; Scrapy's Twisted
        # reactor cannot be restarted within the same process.
        process.start()

    def load_documents(self):
        # Only pipelines whose source is backed by a filesystem path load
        # directly; URL-backed sources are handled by download_documents().
        for pipeline in self.pipelines.all():
            if pipeline.source.path:
                pipeline.load_documents()

    def extract_documents(self):
        for pipeline in self.pipelines.all():
            pipeline.extract_documents()

    def annotate_documents(self):
        for pipeline in self.pipelines.all():
            pipeline.annotate_documents()

    def write_documents(self):
        for pipeline in self.pipelines.all():
            pipeline.write_documents()

    class Meta:
        db_table = 'project'

    def __str__(self):
        return self.name
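
# Usage sketch (an assumption for illustration: run where both Django and
# the Scrapy project settings are configured, e.g. a management command;
# the project name is made up):
#
#   project = Project.objects.get(name='news')
#   project.download_documents()   # crawl sources that expose a URL
#   project.load_documents()       # load sources that expose a path
#   project.extract_documents()
#   project.annotate_documents()
#   project.write_documents()
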
class Source(models.Model):
    name = models.CharField(max_length=32, primary_key=True)
    url = models.URLField(blank=True)
    path = models.TextField(blank=True, default='')

    class Meta:
        db_table = 'source'

    def __str__(self):
        return self.name

class Pipeline(models.Model):
    name = models.CharField(max_length=32, primary_key=True)
    project = models.ForeignKey(Project, related_name='pipelines', on_delete=models.PROTECT)
    source = models.ForeignKey(Source, related_name='pipelines', on_delete=models.PROTECT)
    # Each component field names a plugin module that is resolved at
    # runtime, e.g. loader='csv' dispatches to the module loaders.csv.
    loader = models.CharField(max_length=20, blank=True)
    spider = models.CharField(max_length=20, blank=True)
    extractor = models.CharField(max_length=20, blank=True)
    annotation = models.CharField(max_length=100, blank=True)
    writer = models.CharField(max_length=20, blank=True)

    # Every component field is optional (blank=True), so each dispatch
    # method checks the name before importing.
    def load_documents(self):
        if self.loader:
            loader = importlib.import_module(f'loaders.{self.loader}')
            loader.load_documents(self)

    def extract_documents(self):
        if self.extractor:
            extractor = importlib.import_module(f'extractors.{self.extractor}')
            extractor.extract(self.documents.all())

    def extract(self, document):
        if self.extractor:
            extractor = importlib.import_module(f'extractors.{self.extractor}')
            extractor.extract([document])

    def annotate_documents(self):
        if self.annotation:
            annotation_pipe = importlib.import_module(f'annotation.{self.annotation}')
            annotation_pipe.annotate(self.documents.all())

    def annotate(self, document):
        if self.annotation:
            annotation_pipe = importlib.import_module(f'annotation.{self.annotation}')
            annotation_pipe.annotate([document])

    def write_documents(self):
        for doc in self.documents.all():
            self.write(doc)

    def write(self, document):
        if self.writer:
            writer = importlib.import_module(f'writers.{self.writer}')
            writer.write(document)

    class Meta:
        db_table = 'pipeline'

    def __str__(self):
        return self.name
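
# Plugin contract inferred from the importlib dispatch above: each component
# name resolves to a module inside the matching package and must expose one
# entry point -- loaders.<name>.load_documents(pipeline),
# extractors.<name>.extract(documents), annotation.<name>.annotate(documents),
# and writers.<name>.write(document). A minimal hypothetical writer module
# (file name and behavior are illustrative, not part of this project):
#
#   # writers/stdout.py
#   def write(document):
#       print(document)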