in src/scripts/atel_crawler.py [0:0]
def _crawl_one(self, id, *, timeout=30):
    """Fetch a single telegram page and parse it into a flat record.

    The page is requested from ``self._url`` with the query parameter
    ``read=<id>`` and the headers in ``self._headers``. Metadata is pulled
    out of the ``<div id="telegram">`` element piece by piece; each piece
    is removed from the tree as it is read so that whatever markup is left
    at the end is the telegram body, which is handed to
    ``self._stripper.handle`` to produce the description.

    Args:
        id: Identifier of the telegram to fetch (used as the ``read``
            query parameter). The name shadows the builtin but is kept
            because it is part of the existing call signature.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled server would block the crawler forever.

    Returns:
        dict with the keys ``id``, ``link``, ``title``, ``authors``,
        ``mailto`` (empty string when the author block has no usable
        link), ``date``, ``provenance``, ``subjects`` and ``description``.

    Raises:
        AttributeError: if a required element (the telegram container,
            title, header, author/date blocks or subjects) is missing
            from the page.
        requests.exceptions.RequestException: on network failure,
            including the timeout expiring.
    """
    def _discard(tag):
        # These elements are thrown away anyway, so a page that happens
        # to lack one of them must not abort the whole crawl.
        if tag is not None:
            tag.decompose()

    rec = {'id': id}
    rec['link'] = f'{self._url}/?read={id}'

    response = requests.get(
        self._url,
        params={'read': id},
        headers=self._headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('div', id='telegram')

    # Statement order matters below: every find() returns the *first*
    # match still in the tree, so earlier extractions change what later
    # lookups see.
    _discard(body.find('div', id='tnav'))
    rec['title'] = body.find('center').extract().get_text()
    header = body.find('em').extract()
    _discard(body.find('p', align='CENTER'))

    # First <strong> in the header holds the authors; once it is extracted
    # the next <strong> is the date.
    author_tag = header.find('strong').extract()
    rec['authors'] = author_tag.get_text()
    mail_link = author_tag.find('a')
    if mail_link is not None:
        rec['mailto'] = mail_link.attrs.get('href', '').replace('mailto:', '')
    else:
        rec['mailto'] = ''
    rec['date'] = header.find('strong').get_text()
    rec['provenance'] = header.find('em').get_text() \
        .replace('Credential Certification:', '').strip()

    subjects_tag = body.find('div', id='subjects').extract()
    rec['subjects'] = subjects_tag.get_text().replace('Subjects:', '').strip()

    _discard(body.find('a', class_='twitter-share-button'))
    _discard(body.find('script'))

    # Drop a leading paragraph only when it is blank.
    first_par = body.find('p')
    if first_par is not None and first_par.get_text().strip() == '':
        first_par.decompose()

    rec['description'] = self._stripper.handle(str(body)).strip()
    return rec