Commit badd76e0fb3222130c51d1beb7bf3c31365365bb

Authored by Bartłomiej Nitoń
1 parent a78e298f

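Fix snt_committees downloader: update the committees URL and XPath selectors, add the missing committee mapping, and adjust the tests.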

collector/downloader/spiders/snt_committees.py
... ... @@ -48,15 +48,14 @@ class SenatCommitteesSpider(scrapy.Spider):
48 48 def _get_committees(self):
49 49 committees = {}
50 50  
51   - committees_html = requests.post(urljoin(self.pipeline.source.url, 'prace/komisje-senackie'),
52   - headers={'user-agent': scrapy_settings.USER_AGENT},
53   - data={'kadencja': str(self._term_no)})
54   - response = scrapy.http.HtmlResponse(url=self.pipeline.source.url, body=committees_html.content)
  51 + committees_url = urljoin(self.pipeline.source.url, f'prace/komisje-senackie/?kadencja={self._term_no}')
  52 + committees_html = requests.post(committees_url, headers={'user-agent': scrapy_settings.USER_AGENT})
  53 + response = scrapy.http.HtmlResponse(url=committees_url, body=committees_html.content)
55 54  
56 55 committees_divs = response.xpath('.//div[@class="nazwa-komisji"]')
57 56 for div in committees_divs:
58 57 committee_url_parts = div.xpath('.//a/@href').extract_first().split(',')
59   - committee_name = div.xpath('.//a/text()').extract_first()
  58 + committee_name = div.xpath('.//a/span[@class="pseudo-link"]/text()').extract_first()
60 59 committees[committee_name] = {'url': committee_url_parts[-1],
61 60 'id': committee_url_parts[-2],
62 61 'abbrev': utils.get_committee_abbrev('senat', committee_name)}
... ... @@ -72,7 +71,7 @@ class SenatCommitteesSpider(scrapy.Spider):
72 71 if meta['finished']:
73 72 return
74 73  
75   - sittings = response.xpath('.//table[@class="tabela-posiedzenia"]/tbody/tr[not(@class="header")]')
  74 + sittings = response.xpath('.//table[@class="tabela-posiedzenia-komisji"]/tbody/tr')
76 75 self._save_documents(meta, sittings)
77 76  
78 77 meta['page'] += 1
... ... @@ -81,25 +80,25 @@ class SenatCommitteesSpider(scrapy.Spider):
81 80  
82 81 def _save_documents(self, meta, documents):
83 82 for doc in documents:
  83 + doc_info = doc.xpath('.//td')
84 84  
85   - sitting_no = doc.xpath('.//td[@class="pierwsza"]/text()').extract_first().strip()
86   -
  85 + sitting_no = doc_info[0].xpath('text()').extract_first().strip()
87 86 if sitting_no == '1' or (self._stop_sitting_no and int(sitting_no) <= self._stop_sitting_no):
88 87 meta['finished'] = True
89 88  
90 89 if self._committee_fully_downloaded(sitting_no, meta):
91 90 continue
92 91  
93   - dates = self._get_dates(doc)
  92 + dates = self._get_dates(doc_info[1])
94 93  
95   - pdf_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/'
96   - 'div[@class="posiedzenia-komisje-stenogram"]/'
97   - 'a[text()="Stenogram - plik pdf"]/@href').extract_first()
  94 + pdf_url = doc_info[2].xpath('.//div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
  95 + 'a[contains(text(), "Stenogram - plik pdf")]/@href').extract_first()
98 96 if pdf_url:
99 97 pdf_url = urljoin(self.pipeline.source.url, pdf_url)
100 98  
101   - html_url = doc.xpath('.//td[@class="czwarta"]/div[@class="area-right"]/'
102   - 'a[@class="stenogram-link"]/@href').extract_first()
  99 + html_url = doc_info[2].xpath('./div[@class="area-right"]/div[@class="posiedzenia-komisje-stenogram"]/'
  100 + 'a[contains(text(), "Stenogram - wersja html")]/@href').extract_first()
  101 +
103 102 if html_url:
104 103 html_url = urljoin(self.pipeline.source.url, html_url)
105 104 self._save_document(html_url, pdf_url, dates)
... ... @@ -116,9 +115,9 @@ class SenatCommitteesSpider(scrapy.Spider):
116 115  
117 116 return True
118 117  
119   - def _get_dates(self, sitting_row):
  118 + def _get_dates(self, dates_td):
120 119 dates = []
121   - dates_text = sitting_row.xpath('.//td[@class="druga"]/text()').extract()
  120 + dates_text = dates_td.xpath('text()').extract()
122 121  
123 122 for date in dates_text:
124 123 date = date.replace('r.', '').strip()
... ...
collector/projects/ppc/mappings.py
... ... @@ -381,7 +381,10 @@ SENAT_COMMITTEES = {
381 381 'Komisja Ustawodawcza': {'abbrev': 'U', 'name': '--||--'},
382 382 'Komisja Zdrowia': {'abbrev': 'Z', 'name': '--||--'},
383 383  
384   - 'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'}, # oryginalnie pp
  384 + # Senackie, 10 kadencja
  385 + 'Komisja Nadzwyczajna do spraw Klimatu': {'abbrev': 'NK', 'name': '--||--'},
  386 +
  387 + 'Posiedzenie Przewodniczących Komisji': {'abbrev': 'PPK', 'name': '--||--'} # oryginalnie pp
385 388 }
386 389  
387 390  
... ...
collector/projects/ppc/tests/test_snt_committees_pipeline.py
... ... @@ -43,9 +43,9 @@ class TestSenatCommitteesPipeline(TestCase):
43 43 committee='Komisja Obrony Narodowej', page=3, stop_sitting_no=47)
44 44  
45 45 process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
46   - committee='Komisja Ustawodawcza', page=11, stop_sitting_no=6)
  46 + committee='Komisja Ustawodawcza', page=12, stop_sitting_no=6)
47 47 process.crawl(cls.pipeline_10.spider, pipeline_name=cls.pipeline_10.name, term_no=10,
48   - committee='Komisja Praw Człowieka, Praworządności i Petycji', page=5, stop_sitting_no=21)
  48 + committee='Komisja Praw Człowieka, Praworządności i Petycji', page=6, stop_sitting_no=21)
49 49  
50 50 process.start()
51 51  
... ... @@ -500,10 +500,7 @@ class TestSenatCommitteesPipeline(TestCase):
500 500 self.project.extract_documents()
501 501  
502 502 for doc in Document.objects.all():
503   - if doc.id in ['201923-snt-stapx-00021-03']:
504   - self.assertFalse(doc.chunks.exists())
505   - else:
506   - self.assertTrue(doc.chunks.exists())
  503 + self.assertTrue(doc.chunks.exists())
507 504  
508 505 for doc in Document.objects.all():
509 506 doc.chunks.all().delete()
... ... @@ -512,10 +509,7 @@ class TestSenatCommitteesPipeline(TestCase):
512 509 self.project.extract_documents()
513 510  
514 511 for doc in Document.objects.all():
515   - if doc.id in ['201923-snt-stapx-00021-03']:
516   - self.assertTrue(doc.broken_source)
517   - else:
518   - self.assertFalse(doc.broken_source)
  512 + self.assertFalse(doc.broken_source)
519 513  
520 514 for doc in Document.objects.all():
521 515 doc.chunks.all().delete()
... ...