Projects STRLCPY maigret Commits a9543e83
🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexing completes)
  • ■ ■ ■ ■ ■
    maigret/checking.py
    skipped 96 lines
    97 97   site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
    98 98   
    99 99   
    100  -# TODO: move info separate module
     100 +# TODO: move to separate class
    101 101  def detect_error_page(html_text, status_code, fail_flags, ignore_403):
    102 102   # Detect service restrictions such as a country restriction
    103 103   for flag, msg in fail_flags.items():
    skipped 166 lines
    270 270   new_usernames[v] = k
    271 271   
    272 272   results_info['ids_usernames'] = new_usernames
     273 + results_info['ids_links'] = eval(extracted_ids_data.get('links', '[]'))
    273 274   result.ids_data = extracted_ids_data
    274 275   
    275 276   # Notify caller about results of query.
    skipped 327 lines
  • ■ ■ ■ ■ ■ ■
    maigret/maigret.py
    skipped 324 lines
    325 325   # TODO: fix no site data issue
    326 326   if not dictionary:
    327 327   continue
     328 + 
    328 329   new_usernames = dictionary.get('ids_usernames')
    329 330   if new_usernames:
    330 331   for u, utype in new_usernames.items():
    331 332   usernames[u] = utype
     333 + 
     334 + for url in dictionary.get('ids_links', []):
     335 + for s in db.sites:
     336 + u = s.detect_username(url)
     337 + if u:
     338 + usernames[u] = 'username'
    332 339   
    333 340   # reporting for a one username
    334 341   if args.xmind:
    skipped 53 lines
  • ■ ■ ■ ■ ■ ■
    maigret/sites.py
    skipped 1 lines
    2 2  """Maigret Sites Information"""
    3 3  import copy
    4 4  import json
     5 +import re
    5 6  import sys
    6 7   
    7 8  import requests
    8 9   
    9  -from .utils import CaseConverter
     10 +from .utils import CaseConverter, URLMatcher
    10 11   
    11 12   
    12 13  class MaigretEngine:
    skipped 8 lines
    21 22   
    22 23   
    23 24  class MaigretSite:
    # Attribute names excluded from the serialized form produced by the
    # `json` property: runtime-only objects (futures, engine objects,
    # compiled regexps, stats) plus `name`, which is stored as the dict key.
    NOT_SERIALIZABLE_FIELDS = [
        'name',
        'engineData',
        'requestFuture',
        'detectedEngine',
        'engineObj',
        'stats',
        'urlRegexp',
    ]
     34 + 
    24 35   def __init__(self, name, information):
    25 36   self.name = name
    26 37   
    skipped 30 lines
    57 68   # We do not know the popularity, so make site go to bottom of list.
    58 69   self.alexa_rank = sys.maxsize
    59 70   
     71 + self.update_detectors()
    60 72   
    def __str__(self):
        """Human-readable form: site name followed by its main URL in parentheses."""
        return '{} ({})'.format(self.name, self.url_main)
    63 75   
    def update_detectors(self):
        """Recompile the profile-URL regexp from the site's URL template.

        Does nothing when the site has no `url` attribute yet.
        """
        if 'url' not in self.__dict__:
            return

        url_template = self.url
        # Expand {urlMain}/{urlSubpath} placeholders with the corresponding
        # snake_case attribute values before building the regexp.
        for placeholder in ('urlMain', 'urlSubpath'):
            if placeholder in url_template:
                attr_name = CaseConverter.camel_to_snake(placeholder)
                url_template = url_template.replace('{' + placeholder + '}', self.__dict__[attr_name])

        self.url_regexp = URLMatcher.make_profile_url_regexp(url_template, self.regex_check)
     84 + 
    def detect_username(self, url: str) -> str:
        """Extract a username from a profile URL via the site's URL regexp.

        Returns the captured username with any trailing slash stripped,
        or None when no regexp is compiled or the URL does not match.
        """
        # Fix: removed a stray, unused `import logging` that was left inside
        # the function body.
        if self.url_regexp:
            match_groups = self.url_regexp.match(url)
            if match_groups:
                # The username placeholder is the last capturing group.
                return match_groups.groups()[-1].rstrip('/')

        return None
     93 + 
    64 94   @property
    65 95   def json(self):
    66 96   result = {}
    skipped 3 lines
    70 100   # strip empty elements
    71 101   if v in (False, '', [], {}, None, sys.maxsize, 'username'):
    72 102   continue
    73  - if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj', 'stats']:
     103 + if field in self.NOT_SERIALIZABLE_FIELDS:
    74 104   continue
    75 105   result[field] = v
    76 106   
    skipped 1 lines
    78 108   
    def update(self, updates: dict) -> "MaigretSite":
        """Merge `updates` into the site's attributes in place.

        Rebuilds the profile-URL detector afterwards, since the update may
        have changed URL-related attributes. Returns self for chaining.
        """
        self.__dict__.update(updates)
        self.update_detectors()

        return self
    83 114   
    skipped 11 lines
    95 126   self.__dict__[field] = v
    96 127   
    97 128   self.engine_obj = engine
     129 + self.update_detectors()
    98 130   
    99 131   return self
    100 132   
    skipped 2 lines
    103 135   return self
    104 136   
    105 137   self.request_future = None
     138 + self.url_regexp = None
     139 + 
    106 140   self_copy = copy.deepcopy(self)
    107 141   engine_data = self_copy.engine_obj.site
    108 142   site_data_keys = list(self_copy.__dict__.keys())
    skipped 181 lines
  • ■ ■ ■ ■ ■ ■
    maigret/utils.py
    1 1  import re
     2 +import sys
    2 3   
    3 4   
    4 5  class CaseConverter:
    skipped 24 lines
    29 30   if link.startswith('www.') or (link.startswith('http') and '//' in link):
    30 31   return f'<a class="auto-link" href="{link}">{link}</a>'
    31 32   return link
     33 + 
     34 + 
     35 +class URLMatcher:
     36 + _HTTP_URL_RE_STR = '^https?://(www.)?(.+)$'
     37 + HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
     38 + UNSAFE_SYMBOLS = '.?'
     39 + 
     40 + @classmethod
     41 + def extract_main_part(self, url: str) -> str:
     42 + match = self.HTTP_URL_RE.search(url)
     43 + if match and match.group(2):
     44 + return match.group(2).rstrip('/')
     45 + 
     46 + return ''
     47 + 
     48 + @classmethod
     49 + def make_profile_url_regexp(self, url: str, username_regexp: str = ''):
     50 + url_main_part = self.extract_main_part(url)
     51 + for c in self.UNSAFE_SYMBOLS:
     52 + url_main_part = url_main_part.replace(c, f'\\{c}')
     53 + username_regexp = username_regexp or '.+?'
     54 + 
     55 + url_regexp = url_main_part.replace('{username}', f'({username_regexp})')
     56 + regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
     57 + 
     58 + return re.compile(regexp_str)
  • ■ ■ ■ ■ ■ ■
    tests/test_sites.py
    skipped 112 lines
    113 113   assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
    114 114   
    115 115   
     116 +def test_site_url_detector():
     117 + db = MaigretDatabase()
     118 + db.load_from_json(EXAMPLE_DB)
     119 + 
     120 + assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
     121 + assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
     122 + 
     123 + 
    116 124  def test_ranked_sites_dict():
    117 125   db = MaigretDatabase()
    118 126   db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
    skipped 53 lines
  • ■ ■ ■ ■ ■ ■
    tests/test_utils.py
    1 1  """Maigret utils test functions"""
    2  -from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
     2 +import itertools
     3 +import re
     4 +from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher
    3 5   
    4 6   
    5 7  def test_case_convert_camel_to_snake():
    skipped 27 lines
    33 35   assert enrich_link_str('test') == 'test'
    34 36   assert enrich_link_str(' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
    35 37   
     38 +def test_url_extract_main_part():
     39 + url_main_part = 'flickr.com/photos/alexaimephotography'
     40 + 
     41 + parts = [
     42 + ['http://', 'https://'],
     43 + ['www.', ''],
     44 + [url_main_part],
     45 + ['/', ''],
     46 + ]
     47 + 
     48 + url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
     49 + for url_parts in itertools.product(*parts):
     50 + url = ''.join(url_parts)
     51 + assert URLMatcher.extract_main_part(url) == url_main_part
     52 + assert not url_regexp.match(url) is None
     53 + 
     54 +def test_url_make_profile_url_regexp():
     55 + url_main_part = 'flickr.com/photos/{username}'
     56 + 
     57 + parts = [
     58 + ['http://', 'https://'],
     59 + ['www.', ''],
     60 + [url_main_part],
     61 + ['/', ''],
     62 + ]
     63 + 
     64 + for url_parts in itertools.product(*parts):
     65 + url = ''.join(url_parts)
     66 + assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
     67 + 
Please wait...
Page is in error, reload to recover