■ ■ ■ ■ ■ ■
snscrape/modules/telegram.py
| skipped 223 lines |
224 | 224 | | if '/s/' not in r.url: |
225 | 225 | | _logger.warning('No public post list for this user') |
226 | 226 | | return |
| 227 | + | nextPageUrl = '' |
227 | 228 | | while True: |
228 | 229 | | yield from self._soup_to_items(soup, r.url) |
| 230 | + | try: |
| 231 | + | if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1': |
| 232 | + | # if message 1 is the first message in the page, terminate scraping |
| 233 | + | break |
| 234 | + | except: |
| 235 | + | pass |
229 | 236 | | pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) |
230 | 237 | | if not pageLink: |
| 238 | + | # some pages are missing a "tme_messages_more" tag, causing early termination |
| 239 | + | if '=' not in nextPageUrl: |
| 240 | + | nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href'] |
231 | 241 | | nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20 |
232 | 242 | | if nextPostIndex > 20: |
233 | 243 | | pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'} |
| skipped 81 lines |