Commit badd76e0fb3222130c51d1beb7bf3c31365365bb (1 parent: a78e298f)
Fix snt_committees downloader.
Showing 3 changed files with 23 additions and 27 deletions
collector/downloader/spiders/snt_committees.py
@@ -48,15 +48,14 @@ class SenatCommitteesSpider(scrapy.Spider): | @@ -48,15 +48,14 @@ class SenatCommitteesSpider(scrapy.Spider): | ||
48 | def _get_committees(self): | 48 | def _get_committees(self): |
49 | committees = {} | 49 | committees = {} |
50 | 50 | ||
51 | - committees_html = requests.post(urljoin(self.pipeline.source.url, 'prace/komisje-senackie'), | ||
52 | - headers={'user-agent': scrapy_settings.USER_AGENT}, | ||
53 | - data={'kadencja': str(self._term_no)}) | ||
54 | - response = scrapy.http.HtmlResponse(url=self.pipeline.source.url, body=committees_html.content) | 51 | + committees_url = urljoin(self.pipeline.source.url, f'prace/komisje-senackie/?kadencja={self._term_no}') |
52 | + committees_html = requests.post(committees_url, headers={'user-agent': scrapy_settings.USER_AGENT}) | ||
53 | + response = scrapy.http.HtmlResponse(url=committees_url, body=committees_html.content) | ||
55 | 54 | ||
56 | committees_divs = response.xpath('.//div[@class="nazwa-komisji"]') | 55 | committees_divs = response.xpath('.//div[@class="nazwa-komisji"]') |
57 | for div in committees_divs: | 56 | for div in committees_divs: |
58 | committee_url_parts = div.xpath('.//a/@href').extract_first().split(',') | 57 | committee_url_parts = div.xpath('.//a/@href').extract_first().split(',') |
59 | - committee_name = div.xpath('.//a/text()').extract_first() | 58 | + committee_name = div.xpath('.//a/span[@class="pseudo-link"]/text()').extract_first() |
60 | committees[committee_name] = {'url': committee_url_parts[-1], | 59 | committees[committee_name] = {'url': committee_url_parts[-1], |
61 | 'id': committee_url_parts[-2], | 60 | 'id': committee_url_parts[-2], |
62 | 'abbrev': utils.get_committee_abbrev('senat', committee_name)} | 61 | 'abbrev': utils.get_committee_abbrev('senat', committee_name)} |
@@ -72,7 +71,7 @@ class SenatCommitteesSpider(scrapy.Spider): | @@ -72,7 +71,7 @@ class SenatCommitteesSpider(scrapy.Spider): | ||
72 | if meta['finished']: | 71 | if meta['finished']: |
73 | return | 72 | return |
74 | 73 | ||
75 | - sittings = response.xpath('.//table[@class="tabela-posiedzenia"]/tbody/tr[not(@class="header")]') | 74 | + sittings = response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr') |
76 | self._save_documents(meta, sittings) | 75 | self._save_documents(meta, sittings) |
77 | 76 | ||
78 | meta['page'] += 1 | 77 | meta['page'] += 1 |
@@ -81,25 +80,25 @@ class SenatCommitteesSpider(scrapy.Spider): | @@ -81,25 +80,25 @@ class SenatCommitteesSpider(scrapy.Spider): | ||
81 | 80 | ||
82 | def _save_documents(self, meta, documents): | 81 | def _save_documents(self, meta, documents): |
83 | for doc in documents: | 82 | for doc in documents: |
83 | + doc_info = doc.xpath('.//td') | ||
84 | 84 | ||
85 | - sitting_no = doc.xpath('.//td[@class="pierwsza"]/text()').extract_first().strip() | ||
86 | - | 85 | + sitting_no = doc_info[0].xpath('text()').extract_first().strip() |
87 | if sitting_no == '1' or (self._stop_sitting_no and int(sitting_no) <= self._stop_sitting_no): | 86 | if sitting_no == '1' or (self._stop_sitting_no and int(sitting_no) <= self._stop_sitting_no): |
88 | meta['finished'] = True | 87 | meta['finished'] = True |
89 | 88 | ||
90 | if self._committee_fully_downloaded(sitting_no, meta): | 89 | if self._committee_fully_downloaded(sitting_no, meta): |
91 | continue | 90 | continue |
92 | 91 | ||
93 | - dates = self._get_dates(doc) | 92 | + dates = self._get_dates(doc_info[1]) |
94 | 93 | ||
95 | - pdf_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/' | ||
96 | - 'div[@class="posiedzenia-komisje-stenogram"]/' | ||
97 | - 'a[text()="Stenogram - plik pdf"]/@href').extract_first() | 94 | + pdf_url = doc_info[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/' |
95 | + 'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first() | ||
98 | if pdf_url: | 96 | if pdf_url: |
99 | pdf_url = urljoin(self.pipeline.source.url, pdf_url) | 97 | pdf_url = urljoin(self.pipeline.source.url, pdf_url) |
100 | 98 | ||
101 | - html_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/' | ||
102 | - 'a[@class="stenogram-link"]/@href').extract_first() | 99 | + html_url = doc_info[2].xpath('./div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/' |
100 | + 'a[contains(text(), "Stenogram - wersja html")]/@href').extract_first() | ||
101 | + | ||
103 | if html_url: | 102 | if html_url: |
104 | html_url = urljoin(self.pipeline.source.url, html_url) | 103 | html_url = urljoin(self.pipeline.source.url, html_url) |
105 | self._save_document(html_url, pdf_url, dates) | 104 | self._save_document(html_url, pdf_url, dates) |
@@ -116,9 +115,9 @@ class SenatCommitteesSpider(scrapy.Spider): | @@ -116,9 +115,9 @@ class SenatCommitteesSpider(scrapy.Spider): | ||
116 | 115 | ||
117 | return True | 116 | return True |
118 | 117 | ||
119 | - def _get_dates(self, sitting_row): | 118 | + def _get_dates(self, dates_td): |
120 | dates = [] | 119 | dates = [] |
121 | - dates_text = sitting_row.xpath('.//td[@class="druga"]/text()').extract() | 120 | + dates_text = dates_td.xpath('text()').extract() |
122 | 121 | ||
123 | for date in dates_text: | 122 | for date in dates_text: |
124 | date = date.replace('r.', '').strip() | 123 | date = date.replace('r.', '').strip() |
collector/projects/ppc/mappings.py
@@ -381,7 +381,10 @@ SENAT_COMMITTEES = { | @@ -381,7 +381,10 @@ SENAT_COMMITTEES = { | ||
381 | 'Komisja Ustawodawcza': {'abbrev': 'U', 'name': '--||--'}, | 381 | 'Komisja Ustawodawcza': {'abbrev': 'U', 'name': '--||--'}, |
382 | 'Komisja Zdrowia': {'abbrev': 'Z', 'name': '--||--'}, | 382 | 'Komisja Zdrowia': {'abbrev': 'Z', 'name': '--||--'}, |
383 | 383 | ||
384 | - 'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'}, # oryginalnie pp | 384 | + # Senackie, 10 kadencja |
385 | + 'Komisja Nadzwyczajna do spraw Klimatu': {'abbrev': 'NK', 'name': '--||--'}, | ||
386 | + | ||
387 | + 'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'} # oryginalnie pp | ||
385 | } | 388 | } |
386 | 389 | ||
387 | 390 |
collector/projects/ppc/tests/test_snt_committees_pipeline.py
@@ -43,9 +43,9 @@ class TestSenatCommitteesPipeline(TestCase): | @@ -43,9 +43,9 @@ class TestSenatCommitteesPipeline(TestCase): | ||
43 | committee='Komisja Obrony Narodowej', page=3, stop_sitting_no=47) | 43 | committee='Komisja Obrony Narodowej', page=3, stop_sitting_no=47) |
44 | 44 | ||
45 | process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10, | 45 | process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10, |
46 | - committee='Komisja Ustawodawcza', page=11, stop_sitting_no=6) | 46 | + committee='Komisja Ustawodawcza', page=12, stop_sitting_no=6) |
47 | process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10, | 47 | process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10, |
48 | - committee='Komisja Praw Człowieka, Praworządności i Petycji', page=5, stop_sitting_no=21) | 48 | + committee='Komisja Praw Człowieka, Praworządności i Petycji', page=6, stop_sitting_no=21) |
49 | 49 | ||
50 | process.start() | 50 | process.start() |
51 | 51 | ||
@@ -500,10 +500,7 @@ class TestSenatCommitteesPipeline(TestCase): | @@ -500,10 +500,7 @@ class TestSenatCommitteesPipeline(TestCase): | ||
500 | self.project.extract_documents() | 500 | self.project.extract_documents() |
501 | 501 | ||
502 | for doc in Document.objects.all(): | 502 | for doc in Document.objects.all(): |
503 | - if doc.id in ['201923-snt-stapx-00021-03']: | ||
504 | - self.assertFalse(doc.chunks.exists()) | ||
505 | - else: | ||
506 | - self.assertTrue(doc.chunks.exists()) | 503 | + self.assertTrue(doc.chunks.exists()) |
507 | 504 | ||
508 | for doc in Document.objects.all(): | 505 | for doc in Document.objects.all(): |
509 | doc.chunks.all().delete() | 506 | doc.chunks.all().delete() |
@@ -512,10 +509,7 @@ class TestSenatCommitteesPipeline(TestCase): | @@ -512,10 +509,7 @@ class TestSenatCommitteesPipeline(TestCase): | ||
512 | self.project.extract_documents() | 509 | self.project.extract_documents() |
513 | 510 | ||
514 | for doc in Document.objects.all(): | 511 | for doc in Document.objects.all(): |
515 | - if doc.id in ['201923-snt-stapx-00021-03']: | ||
516 | - self.assertTrue(doc.broken_source) | ||
517 | - else: | ||
518 | - self.assertFalse(doc.broken_source) | 512 | + self.assertFalse(doc.broken_source) |
519 | 513 | ||
520 | for doc in Document.objects.all(): | 514 | for doc in Document.objects.all(): |
521 | doc.chunks.all().delete() | 515 | doc.chunks.all().delete() |