Commit badd76e0fb3222130c51d1beb7bf3c31365365bb

Authored by Bartłomiej Nitoń
1 parent a78e298f

Fix snt_committees downloader.

collector/downloader/spiders/snt_committees.py
@@ -48,15 +48,14 @@ class SenatCommitteesSpider(scrapy.Spider):
     def _get_committees(self):
         committees = {}

-        committees_html = requests.post(urljoin(self.pipeline.source.url, 'prace/komisje-senackie'),
-                                        headers={'user-agent': scrapy_settings.USER_AGENT},
-                                        data={'kadencja': str(self._term_no)})
-        response = scrapy.http.HtmlResponse(url=self.pipeline.source.url, body=committees_html.content)
+        committees_url = urljoin(self.pipeline.source.url, f'prace/komisje-senackie/?kadencja={self._term_no}')
+        committees_html = requests.post(committees_url, headers={'user-agent': scrapy_settings.USER_AGENT})
+        response = scrapy.http.HtmlResponse(url=committees_url, body=committees_html.content)

         committees_divs = response.xpath('.//div[@class="nazwa-komisji"]')
         for div in committees_divs:
             committee_url_parts = div.xpath('.//a/@href').extract_first().split(',')
-            committee_name = div.xpath('.//a/text()').extract_first()
+            committee_name = div.xpath('.//a/span[@class="pseudo-link"]/text()').extract_first()
             committees[committee_name] = {'url': committee_url_parts[-1],
                                           'id': committee_url_parts[-2],
                                           'abbrev': utils.get_committee_abbrev('senat', committee_name)}
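
Note: the committee list is now fetched from a term-specific URL (the kadencja term number travels in the query string) instead of being POSTed as form data, and committee names are read from the span.pseudo-link element inside each link, matching the portal's updated markup. A minimal standalone sketch of the new flow; the base URL and user agent below are placeholders for pipeline.source.url and scrapy_settings.USER_AGENT:

    import requests
    from urllib.parse import urljoin
    from scrapy.http import HtmlResponse

    BASE_URL = 'https://www.senat.gov.pl/'  # placeholder for pipeline.source.url
    term_no = 10

    # Term number is now a query parameter, not POST form data.
    committees_url = urljoin(BASE_URL, f'prace/komisje-senackie/?kadencja={term_no}')
    committees_html = requests.post(committees_url, headers={'user-agent': 'ppc-collector'})

    # Wrapping the raw bytes in HtmlResponse allows scrapy XPath outside a crawl.
    response = HtmlResponse(url=committees_url, body=committees_html.content)
    for div in response.xpath('.//div[@class="nazwa-komisji"]'):
        name = div.xpath('.//a/span[@class="pseudo-link"]/text()').extract_first()
        print(name, div.xpath('.//a/@href').extract_first())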
@@ -72,7 +71,7 @@ class SenatCommitteesSpider(scrapy.Spider):
         if meta['finished']:
             return

-        sittings = response.xpath('.//table[@class="tabela-posiedzenia"]/tbody/tr[not(@class="header")]')
+        sittings = response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr')
         self._save_documents(meta, sittings)

         meta['page'] += 1
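
Note: the sittings table class changed to tabela-posiedzenia-komisji, and its rows evidently no longer include a header <tr>, so the not(@class="header") filter is dropped. The surrounding callback pages through results by bumping meta['page'] until _save_documents flips meta['finished']. A rough sketch of that loop shape; the spider name and next-page URL construction here are assumptions, and _save_documents is the real spider's method:

    import scrapy

    class SittingsSketchSpider(scrapy.Spider):
        """Sketch of the pagination pattern around the changed selector."""
        name = 'sittings-sketch'  # hypothetical, for illustration only

        def parse(self, response):
            meta = response.meta
            if meta['finished']:
                return  # a sitting at or below stop_sitting_no was already seen

            # New markup: plain rows, no header <tr> to filter out.
            sittings = response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr')
            self._save_documents(meta, sittings)

            meta['page'] += 1
            # Hypothetical next-page URL; the real spider derives it from the
            # committee entry collected in _get_committees.
            yield scrapy.Request(f'{response.url.split("?")[0]}?page={meta["page"]}', meta=meta)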
@@ -81,25 +80,25 @@ class SenatCommitteesSpider(scrapy.Spider):

     def _save_documents(self, meta, documents):
         for doc in documents:
+            doc_info = doc.xpath('.//td')

-            sitting_no = doc.xpath('.//td[@class="pierwsza"]/text()').extract_first().strip()
-
+            sitting_no = doc_info[0].xpath('text()').extract_first().strip()
             if sitting_no == '1' or (self._stop_sitting_no and int(sitting_no) <= self._stop_sitting_no):
                 meta['finished'] = True

             if self._committee_fully_downloaded(sitting_no, meta):
                 continue

-            dates = self._get_dates(doc)
+            dates = self._get_dates(doc_info[1])

-            pdf_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/'
-                                'div[@class="posiedzenia-komisje-stenogram"]/'
-                                'a[text()="Stenogram - plik pdf"]/@href').extract_first()
+            pdf_url = doc_info[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
+                                        'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first()
             if pdf_url:
                 pdf_url = urljoin(self.pipeline.source.url, pdf_url)

-            html_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/'
-                                 'a[@class="stenogram-link"]/@href').extract_first()
+            html_url = doc_info[2].xpath('./div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
+                                         'a[contains(text(), "Stenogram - wersja html")]/@href').extract_first()
+
             if html_url:
                 html_url = urljoin(self.pipeline.source.url, html_url)
                 self._save_document(html_url, pdf_url, dates)
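
Note: row cells are now addressed positionally through doc.xpath('.//td') rather than by the old class names (pierwsza, druga, czwarta), and the stenogram links are matched with contains(text(), …) so extra whitespace or markup in the link text no longer defeats an exact-match XPath. A self-contained sketch of the positional extraction against illustrative row markup, reduced to just the classes the new XPaths rely on:

    from scrapy.http import HtmlResponse

    # Illustrative row; the real table is served by the Senat portal.
    ROW = b'''<table class="tabela-posiedzenia-komisji"><tbody><tr>
      <td>47</td>
      <td>12.05.2021 r.</td>
      <td><div class="area-right"><div class="posiedzenia-komisje-stenogram">
        <a href="/steno.pdf">Stenogram - plik pdf</a>
        <a href="/steno.html">Stenogram - wersja html</a>
      </div></div></td>
    </tr></tbody></table>'''

    row = HtmlResponse(url='http://example.invalid/', body=ROW).xpath('.//tbody/tr')[0]
    tds = row.xpath('.//td')  # positional access replaces td[@class="pierwsza"] etc.

    sitting_no = tds[0].xpath('text()').extract_first().strip()
    pdf_url = tds[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
                           'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first()
    html_url = tds[2].xpath('./div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
                            'a[contains(text(), "Stenogram - wersja html")]/@href').extract_first()
    print(sitting_no, pdf_url, html_url)  # 47 /steno.pdf /steno.html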
@@ -116,9 +115,9 @@ class SenatCommitteesSpider(scrapy.Spider):

         return True

-    def _get_dates(self, sitting_row):
+    def _get_dates(self, dates_td):
         dates = []
-        dates_text = sitting_row.xpath('.//td[@class="druga"]/text()').extract()
+        dates_text = dates_td.xpath('text()').extract()

         for date in dates_text:
             date = date.replace('r.', '').strip()
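
Note: _get_dates now receives the dates <td> directly (doc_info[1]) instead of re-selecting td.druga from the whole row. Each text node in that cell is a date ending in the Polish suffix 'r.' ('roku', i.e. 'of the year'), which is stripped before further parsing. A small sketch, assuming the DD.MM.YYYY shape shown below; the real cell contents may differ:

    from scrapy.http import HtmlResponse

    TD = b'<table><tr><td>12.05.2021 r.<br>13.05.2021 r.</td></tr></table>'
    dates_td = HtmlResponse(url='http://example.invalid/', body=TD).xpath('.//td')[0]

    dates = []
    for date in dates_td.xpath('text()').extract():
        # Drop the 'r.' suffix and surrounding whitespace, as the spider does.
        dates.append(date.replace('r.', '').strip())
    print(dates)  # ['12.05.2021', '13.05.2021']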
collector/projects/ppc/mappings.py
@@ -381,7 +381,10 @@ SENAT_COMMITTEES = {
     'Komisja Ustawodawcza': {'abbrev': 'U', 'name': '--||--'},
     'Komisja Zdrowia': {'abbrev': 'Z', 'name': '--||--'},

-    'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'},  # oryginalnie pp
+    # Senackie, 10 kadencja
+    'Komisja Nadzwyczajna do spraw Klimatu': {'abbrev': 'NK', 'name': '--||--'},
+
+    'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'}  # oryginalnie pp
 }


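
Note: the new mapping entry (under the comment 'Senackie, 10 kadencja', i.e. Senate committees, 10th term) registers Komisja Nadzwyczajna do spraw Klimatu (the Extraordinary Committee on Climate) so that utils.get_committee_abbrev('senat', committee_name), called in _get_committees above, can resolve it. That helper's implementation isn't part of this commit; presumably it is a plain lookup along these lines:

    # Assumed shape of the helper; the real one lives in the collector's utils
    # and would read SENAT_COMMITTEES from collector/projects/ppc/mappings.py.
    SENAT_COMMITTEES = {
        'Komisja Nadzwyczajna do spraw Klimatu': {'abbrev': 'NK', 'name': '--||--'},
    }

    def get_committee_abbrev(chamber, committee_name):
        if chamber == 'senat':
            return SENAT_COMMITTEES[committee_name]['abbrev']
        raise KeyError(chamber)

    print(get_committee_abbrev('senat', 'Komisja Nadzwyczajna do spraw Klimatu'))  # NK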
collector/projects/ppc/tests/test_snt_committees_pipeline.py
@@ -43,9 +43,9 @@ class TestSenatCommitteesPipeline(TestCase):
                       committee='Komisja Obrony Narodowej', page=3, stop_sitting_no=47)

         process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
-                      committee='Komisja Ustawodawcza', page=11, stop_sitting_no=6)
+                      committee='Komisja Ustawodawcza', page=12, stop_sitting_no=6)
         process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
-                      committee='Komisja Praw Człowieka, Praworządności i Petycji', page=5, stop_sitting_no=21)
+                      committee='Komisja Praw Człowieka, Praworządności i Petycji', page=6, stop_sitting_no=21)

         process.start()

@@ -500,10 +500,7 @@ class TestSenatCommitteesPipeline(TestCase):
         self.project.extract_documents()

         for doc in Document.objects.all():
-            if doc.id in ['201923-snt-stapx-00021-03']:
-                self.assertFalse(doc.chunks.exists())
-            else:
-                self.assertTrue(doc.chunks.exists())
+            self.assertTrue(doc.chunks.exists())

         for doc in Document.objects.all():
             doc.chunks.all().delete()
@@ -512,10 +509,7 @@ class TestSenatCommitteesPipeline(TestCase):
         self.project.extract_documents()

         for doc in Document.objects.all():
-            if doc.id in ['201923-snt-stapx-00021-03']:
-                self.assertTrue(doc.broken_source)
-            else:
-                self.assertFalse(doc.broken_source)
+            self.assertFalse(doc.broken_source)

         for doc in Document.objects.all():
             doc.chunks.all().delete()