diff --git a/collector/downloader/spiders/snt_committees.py b/collector/downloader/spiders/snt_committees.py
index db1ff5c..b79bda5 100644
--- a/collector/downloader/spiders/snt_committees.py
+++ b/collector/downloader/spiders/snt_committees.py
@@ -48,15 +48,14 @@ class SenatCommitteesSpider(scrapy.Spider):
     def _get_committees(self):
         committees = {}
-        committees_html = requests.post(urljoin(self.pipeline.source.url, 'prace/komisje-senackie'),
-                                        headers={'user-agent': scrapy_settings.USER_AGENT},
-                                        data={'kadencja': str(self._term_no)})
-        response = scrapy.http.HtmlResponse(url=self.pipeline.source.url, body=committees_html.content)
+        committees_url = urljoin(self.pipeline.source.url, f'prace/komisje-senackie/?kadencja={self._term_no}')
+        committees_html = requests.post(committees_url, headers={'user-agent': scrapy_settings.USER_AGENT})
+        response = scrapy.http.HtmlResponse(url=committees_url, body=committees_html.content)
         committees_divs = response.xpath('.//div[@class="nazwa-komisji"]')
 
         for div in committees_divs:
             committee_url_parts = div.xpath('.//a/@href').extract_first().split(',')
-            committee_name = div.xpath('.//a/text()').extract_first()
+            committee_name = div.xpath('.//a/span[@class="pseudo-link"]/text()').extract_first()
             committees[committee_name] = {'url': committee_url_parts[-1],
                                           'id': committee_url_parts[-2],
                                           'abbrev': utils.get_committee_abbrev('senat', committee_name)}
 
@@ -72,7 +71,7 @@ class SenatCommitteesSpider(scrapy.Spider):
         if meta['finished']:
             return
 
-        sittings = response.xpath('.//table[@class="tabela-posiedzenia"]/tbody/tr[not(@class="header")]')
+        sittings = response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr')
         self._save_documents(meta, sittings)
 
         meta['page'] += 1
@@ -81,25 +80,25 @@ class SenatCommitteesSpider(scrapy.Spider):
 
     def _save_documents(self, meta, documents):
         for doc in documents:
-            sitting_no = doc.xpath('.//td[@class="pierwsza"]/text()').extract_first().strip()
-
+            doc_info = doc.xpath('.//td')
+            sitting_no = doc_info[0].xpath('text()').extract_first().strip()
             if sitting_no == '1' or (self._stop_sitting_no and int(sitting_no) <= self._stop_sitting_no):
                 meta['finished'] = True
 
             if self._committee_fully_downloaded(sitting_no, meta):
                 continue
 
-            dates = self._get_dates(doc)
+            dates = self._get_dates(doc_info[1])
 
-            pdf_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/'
-                                'div[@class="posiedzenia-komisje-stenogram"]/'
-                                'a[text()="Stenogram - plik pdf"]/@href').extract_first()
+            pdf_url = doc_info[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
+                                        'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first()
             if pdf_url:
                 pdf_url = urljoin(self.pipeline.source.url, pdf_url)
 
-            html_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/'
-                                 'a[@class="stenogram-link"]/@href').extract_first()
+            html_url = doc_info[2].xpath('./div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
+                                         'a[contains(text(), "Stenogram - wersja html")]/@href').extract_first()
+
             if html_url:
                 html_url = urljoin(self.pipeline.source.url, html_url)
                 self._save_document(html_url, pdf_url, dates)
 
@@ -116,9 +115,9 @@ class SenatCommitteesSpider(scrapy.Spider):
 
         return True
 
-    def _get_dates(self, sitting_row):
+    def _get_dates(self, dates_td):
         dates = []
-        dates_text = sitting_row.xpath('.//td[@class="druga"]/text()').extract()
+        dates_text = dates_td.xpath('text()').extract()
 
         for date in dates_text:
             date = date.replace('r.', '').strip()
diff --git a/collector/projects/ppc/mappings.py b/collector/projects/ppc/mappings.py
index 1491a22..8806f4d 100644
--- a/collector/projects/ppc/mappings.py
+++ b/collector/projects/ppc/mappings.py
@@ -381,7 +381,10 @@ SENAT_COMMITTEES = {
     'Komisja Ustawodawcza': {'abbrev': 'U', 'name': '--||--'},
     'Komisja Zdrowia': {'abbrev': 'Z', 'name': '--||--'},
 
-    'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'},  # oryginalnie pp
+    # Senackie, 10 kadencja
+    'Komisja Nadzwyczajna do spraw Klimatu': {'abbrev': 'NK', 'name': '--||--'},
+
+    'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'}  # oryginalnie pp
 }
 
 
diff --git a/collector/projects/ppc/tests/test_snt_committees_pipeline.py b/collector/projects/ppc/tests/test_snt_committees_pipeline.py
index 66ad0cb..fd4c37e 100644
--- a/collector/projects/ppc/tests/test_snt_committees_pipeline.py
+++ b/collector/projects/ppc/tests/test_snt_committees_pipeline.py
@@ -43,9 +43,9 @@ class TestSenatCommitteesPipeline(TestCase):
         process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
                       committee='Komisja Obrony Narodowej', page=3, stop_sitting_no=47)
         process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
-                      committee='Komisja Ustawodawcza', page=11, stop_sitting_no=6)
+                      committee='Komisja Ustawodawcza', page=12, stop_sitting_no=6)
         process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
-                      committee='Komisja Praw Człowieka, Praworządności i Petycji', page=5, stop_sitting_no=21)
+                      committee='Komisja Praw Człowieka, Praworządności i Petycji', page=6, stop_sitting_no=21)
 
         process.start()
 
@@ -500,10 +500,7 @@ class TestSenatCommitteesPipeline(TestCase):
         self.project.extract_documents()
 
         for doc in Document.objects.all():
-            if doc.id in ['201923-snt-stapx-00021-03']:
-                self.assertFalse(doc.chunks.exists())
-            else:
-                self.assertTrue(doc.chunks.exists())
+            self.assertTrue(doc.chunks.exists())
 
         for doc in Document.objects.all():
             doc.chunks.all().delete()
@@ -512,10 +509,7 @@ class TestSenatCommitteesPipeline(TestCase):
         self.project.extract_documents()
 
         for doc in Document.objects.all():
-            if doc.id in ['201923-snt-stapx-00021-03']:
-                self.assertTrue(doc.broken_source)
-            else:
-                self.assertFalse(doc.broken_source)
+            self.assertFalse(doc.broken_source)
 
         for doc in Document.objects.all():
             doc.chunks.all().delete()
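
Note on the row-parsing change: the patch drops the class-based cell selectors (td[@class="pierwsza"], "druga", "czwarta") in favour of positional td lookups plus contains() text matching, which is more tolerant of the reworked senat.gov.pl markup. Below is a minimal, self-contained sketch of that technique; the HTML snippet and sample hrefs are hand-written assumptions modelled on the new table layout, not a captured page.

from scrapy.http import HtmlResponse

# Hypothetical sample row (assumption): mirrors the structure the new XPaths expect.
SAMPLE_ROW = b'''
<table class="tabela-posiedzenia-komisji"><tbody><tr>
  <td>12</td>
  <td>17.03.2021 r.</td>
  <td><div class="area-right"><div class="posiedzenia-komisje-stenogram">
    <a href="/stenogram,123,12.pdf">Stenogram - plik pdf</a>
    <a href="/stenogram,123,12.html">Stenogram - wersja html</a>
  </div></div></td>
</tr></tbody></table>
'''

response = HtmlResponse(url='https://www.senat.gov.pl/', body=SAMPLE_ROW)
for row in response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr'):
    # Positional cell access replaces the old class-based selectors.
    doc_info = row.xpath('.//td')
    sitting_no = doc_info[0].xpath('text()').extract_first().strip()
    # Same normalisation _get_dates applies: strip the Polish 'r.' (year) suffix.
    dates = [d.replace('r.', '').strip() for d in doc_info[1].xpath('text()').extract()]
    # contains() tolerates surrounding whitespace, unlike the old exact text()= match.
    pdf_url = doc_info[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
                                'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first()
    print(sitting_no, dates, pdf_url)  # -> 12 ['17.03.2021'] /stenogram,123,12.pdf

The trade-off is the usual one for positional parsing: contains() and index-based td access survive class renames like tabela-posiedzenia -> tabela-posiedzenia-komisji, but silently break if the site reorders columns, so the pipeline tests above pin expected sitting numbers and page counts.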