Commit badd76e0fb3222130c51d1beb7bf3c31365365bb (parent: a78e298f) — Fix snt_committees downloader.
Showing 3 changed files with 23 additions and 27 deletions.
collector/downloader/spiders/snt_committees.py
... | ... | @@ -48,15 +48,14 @@ class SenatCommitteesSpider(scrapy.Spider): |
48 | 48 | def _get_committees(self): |
49 | 49 | committees = {} |
50 | 50 | |
51 | - committees_html = requests.post(urljoin(self.pipeline.source.url, 'prace/komisje-senackie'), | |
52 | - headers={'user-agent': scrapy_settings.USER_AGENT}, | |
53 | - data={'kadencja': str(self._term_no)}) | |
54 | - response = scrapy.http.HtmlResponse(url=self.pipeline.source.url, body=committees_html.content) | |
51 | + committees_url = urljoin(self.pipeline.source.url, f'prace/komisje-senackie/?kadencja={self._term_no}') | |
52 | + committees_html = requests.post(committees_url, headers={'user-agent': scrapy_settings.USER_AGENT}) | |
53 | + response = scrapy.http.HtmlResponse(url=committees_url, body=committees_html.content) | |
55 | 54 | |
56 | 55 | committees_divs = response.xpath('.//div[@class="nazwa-komisji"]') |
57 | 56 | for div in committees_divs: |
58 | 57 | committee_url_parts = div.xpath('.//a/@href').extract_first().split(',') |
59 | - committee_name = div.xpath('.//a/text()').extract_first() | |
58 | + committee_name = div.xpath('.//a/span[@class="pseudo-link"]/text()').extract_first() | |
60 | 59 | committees[committee_name] = {'url': committee_url_parts[-1], |
61 | 60 | 'id': committee_url_parts[-2], |
62 | 61 | 'abbrev': utils.get_committee_abbrev('senat', committee_name)} |
... | ... | @@ -72,7 +71,7 @@ class SenatCommitteesSpider(scrapy.Spider): |
72 | 71 | if meta['finished']: |
73 | 72 | return |
74 | 73 | |
75 | - sittings = response.xpath('.//table[@class="tabela-posiedzenia"]/tbody/tr[not(@class="header")]') | |
74 | + sittings = response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr') | |
76 | 75 | self._save_documents(meta, sittings) |
77 | 76 | |
78 | 77 | meta['page'] += 1 |
... | ... | @@ -81,25 +80,25 @@ class SenatCommitteesSpider(scrapy.Spider): |
81 | 80 | |
82 | 81 | def _save_documents(self, meta, documents): |
83 | 82 | for doc in documents: |
83 | + doc_info = doc.xpath('.//td') | |
84 | 84 | |
85 | - sitting_no = doc.xpath('.//td[@class="pierwsza"]/text()').extract_first().strip() | |
86 | - | |
85 | + sitting_no = doc_info[0].xpath('text()').extract_first().strip() | |
87 | 86 | if sitting_no == '1' or (self._stop_sitting_no and int(sitting_no) <= self._stop_sitting_no): |
88 | 87 | meta['finished'] = True |
89 | 88 | |
90 | 89 | if self._committee_fully_downloaded(sitting_no, meta): |
91 | 90 | continue |
92 | 91 | |
93 | - dates = self._get_dates(doc) | |
92 | + dates = self._get_dates(doc_info[1]) | |
94 | 93 | |
95 | - pdf_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/' | |
96 | - 'div[@class="posiedzenia-komisje-stenogram"]/' | |
97 | - 'a[text()="Stenogram - plik pdf"]/@href').extract_first() | |
94 | + pdf_url = doc_info[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/' | |
95 | + 'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first() | |
98 | 96 | if pdf_url: |
99 | 97 | pdf_url = urljoin(self.pipeline.source.url, pdf_url) |
100 | 98 | |
101 | - html_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/' | |
102 | - 'a[@class="stenogram-link"]/@href').extract_first() | |
99 | + html_url = doc_info[2].xpath('./div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/' | |
100 | + 'a[contains(text(), "Stenogram - wersja html")]/@href').extract_first() | |
101 | + | |
103 | 102 | if html_url: |
104 | 103 | html_url = urljoin(self.pipeline.source.url, html_url) |
105 | 104 | self._save_document(html_url, pdf_url, dates) |
... | ... | @@ -116,9 +115,9 @@ class SenatCommitteesSpider(scrapy.Spider): |
116 | 115 | |
117 | 116 | return True |
118 | 117 | |
119 | - def _get_dates(self, sitting_row): | |
118 | + def _get_dates(self, dates_td): | |
120 | 119 | dates = [] |
121 | - dates_text = sitting_row.xpath('.//td[@class="druga"]/text()').extract() | |
120 | + dates_text = dates_td.xpath('text()').extract() | |
122 | 121 | |
123 | 122 | for date in dates_text: |
124 | 123 | date = date.replace('r.', '').strip() |
... | ... |
collector/projects/ppc/mappings.py
... | ... | @@ -381,7 +381,10 @@ SENAT_COMMITTEES = { |
381 | 381 | 'Komisja Ustawodawcza': {'abbrev': 'U', 'name': '--||--'}, |
382 | 382 | 'Komisja Zdrowia': {'abbrev': 'Z', 'name': '--||--'}, |
383 | 383 | |
384 | - 'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'}, # oryginalnie pp | |
384 | + # Senackie, 10 kadencja | |
385 | + 'Komisja Nadzwyczajna do spraw Klimatu': {'abbrev': 'NK', 'name': '--||--'}, | |
386 | + | |
387 | + 'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'} # oryginalnie pp | |
385 | 388 | } |
386 | 389 | |
387 | 390 | |
... | ... |
collector/projects/ppc/tests/test_snt_committees_pipeline.py
... | ... | @@ -43,9 +43,9 @@ class TestSenatCommitteesPipeline(TestCase): |
43 | 43 | committee='Komisja Obrony Narodowej', page=3, stop_sitting_no=47) |
44 | 44 | |
45 | 45 | process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10, |
46 | - committee='Komisja Ustawodawcza', page=11, stop_sitting_no=6) | |
46 | + committee='Komisja Ustawodawcza', page=12, stop_sitting_no=6) | |
47 | 47 | process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10, |
48 | - committee='Komisja Praw Człowieka, Praworządności i Petycji', page=5, stop_sitting_no=21) | |
48 | + committee='Komisja Praw Człowieka, Praworządności i Petycji', page=6, stop_sitting_no=21) | |
49 | 49 | |
50 | 50 | process.start() |
51 | 51 | |
... | ... | @@ -500,10 +500,7 @@ class TestSenatCommitteesPipeline(TestCase): |
500 | 500 | self.project.extract_documents() |
501 | 501 | |
502 | 502 | for doc in Document.objects.all(): |
503 | - if doc.id in ['201923-snt-stapx-00021-03']: | |
504 | - self.assertFalse(doc.chunks.exists()) | |
505 | - else: | |
506 | - self.assertTrue(doc.chunks.exists()) | |
503 | + self.assertTrue(doc.chunks.exists()) | |
507 | 504 | |
508 | 505 | for doc in Document.objects.all(): |
509 | 506 | doc.chunks.all().delete() |
... | ... | @@ -512,10 +509,7 @@ class TestSenatCommitteesPipeline(TestCase): |
512 | 509 | self.project.extract_documents() |
513 | 510 | |
514 | 511 | for doc in Document.objects.all(): |
515 | - if doc.id in ['201923-snt-stapx-00021-03']: | |
516 | - self.assertTrue(doc.broken_source) | |
517 | - else: | |
518 | - self.assertFalse(doc.broken_source) | |
512 | + self.assertFalse(doc.broken_source) | |
519 | 513 | |
520 | 514 | for doc in Document.objects.all(): |
521 | 515 | doc.chunks.all().delete() |
... | ... |