Maryam: commit d4728c30
    maryam/core/util/urlib.py
    skipped 36 lines
    37 37   def unparse(self, urparse):
    38 38       return urlparse.urlunparse(urparse)
    39 39   
    40  -  def sub_service(self, serv=None):
     40 +  def sub_service(self, serv=None, ifany=False):
    41 41       '''Add protocol to url or replace it or clean it'''
    42 42       urparse = re.split(r'://', self.url)
    43 43       if not serv:
    skipped 3 lines
    47 47           # Add protocol
    48 48           serv = re.sub(r'://', '', serv)
    49 49           if len(urparse) == 2:
     50 +              if ifany:
     51 +                  return self.url
    50 52               del urparse[0]
    51 53               url = f"{serv}://{''.join(urparse)}"
    52 54           else:
    skipped 99 lines
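
Review note: the new ifany flag short-circuits sub_service when the URL already carries a protocol, so an explicitly typed scheme is never overwritten. A minimal standalone sketch of that behavior (a hypothetical free function; the real method lives on the framework's urlib class and reads self.url):

import re

def sub_service(url, serv=None, ifany=False):
    '''Add a protocol to url, replace it, or strip it.'''
    urparse = re.split(r'://', url)
    if not serv:
        # No service given: strip the protocol if one is present
        return urparse[1] if len(urparse) == 2 else url
    serv = re.sub(r'://', '', serv)
    if len(urparse) == 2:
        if ifany:
            # New behavior: a protocol is already there, keep the URL as-is
            return url
        del urparse[0]
        return f"{serv}://{''.join(urparse)}"
    return f"{serv}://{url}"

print(sub_service('http://example.com', 'https'))              # https://example.com
print(sub_service('http://example.com', 'https', ifany=True))  # http://example.com
print(sub_service('example.com', 'https', ifany=True))         # https://example.com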
    maryam/core/util/web_scrap.py
    skipped 17 lines
    18 18   import re
    19 19   import concurrent.futures
    20 20   
    21  -  # Web Scraper v5.1
     21 +  # Web Scrap v5.1
    22 22   
    23 23   class main:
    24 24   
    skipped 3 lines
    28 28           url : First page address
    29 29           debug : Show results as they are found
    30 30           limit : Web scraping depth (1 means only the first page is searched)
    31  -          thread_count : Count of links for open at per lap
     31 +          thread_count : Number of links for each lap
    32 32           """
    33 33           self.framework = main.framework
    34  -          self.url = url
     34 +          self.parser = self.framework.urlib(url)
     35 +          self.url = self.parser.sub_service(self.framework._global_options['protocol'], ifany=True)
    35 36           self.urlib = self.framework.urlib
    36 37           self.debug = debug
    37 38           self.limit = limit
    skipped 68 lines
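
Review note: with this constructor change the scraper keeps a single parser instance and derives its start URL from it, so a bare host picks up the framework-wide protocol option while an explicitly given scheme survives thanks to ifany=True. Reusing the standalone sub_service sketch from the urlib notes above (default_protocol is a stand-in for self.framework._global_options['protocol']):

default_protocol = 'https'  # stand-in for the framework's protocol option

for start in ('example.com', 'http://example.com'):
    print(start, '->', sub_service(start, default_protocol, ifany=True))
# example.com -> https://example.com
# http://example.com -> http://example.com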
    106 107          return True
    107 108   
    108 109      def joiner(self, url):
    109  -          url = str(url)
    110  -          # ADD slash to end url
     110 +          url = url
    111 111          urparse = self.urlib(url)
    112 112          urparse.url = urparse.quote if '%' not in url else url
    113  -          urparse2 = self.urlib(str(self.url))
    114  -          cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:')
     113 +          cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') or '.' not in url
    115 114          cond12 = url.endswith(':')
    116 115          cond2 = len(
    117 116              urparse.url) > 1 and '%3a//' not in urparse.url and urparse.url[:2] != '//'
    skipped 1 lines
    119 118          if cond1 or cond12:
    120 119              return False
    121 120          elif cond2:
    122  -              urparse.url = urparse2.join(url)
     121 +              urparse.url = self.parser.join(url)
    123 122          elif cond3:
    124 123              urparse.url = url
    125 124          else:
    126  -              urparse.url = urparse2.join(url)
    127  -          return str(urparse.url)
     125 +              urparse.url = self.parser.join(url)
     126 +          return urparse.url
    128 127   
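
Review note: joiner sorts each extracted href into three buckets: junk fragments (cond1/cond12) are dropped, relative paths (cond2) are resolved against the page URL via the shared self.parser, and what appears to be the already-absolute case (cond3, defined in the skipped line) passes through unchanged. A rough standard-library equivalent of that decision, assuming urljoin semantics for the join step (joiner_sketch is hypothetical; the real method also percent-quotes the input):

from urllib.parse import urljoin

def joiner_sketch(base, url):
    junk = ('%20', '', '/', '%23', '#', 'https:', 'http:')
    # The new cond1 also rejects dot-less fragments such as 'about'
    if url.lower() in junk or url.endswith(':') or '.' not in url:
        return False
    return urljoin(base, url)

base = 'https://example.com/dir/'
print(joiner_sketch(base, '#'))                        # False
print(joiner_sketch(base, 'page.html'))                # https://example.com/dir/page.html
print(joiner_sketch(base, 'https://other.org/a.css'))  # https://other.org/a.css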
    129 128      def link_category(self, urls):
    130 129          links = []
    131 130          for url in urls:
    132 131              join = self.joiner(url)
    133  -  
    134 132              ##########################
    135 133              # ADD CDN, PHONE and EMAIL
    136 134              ##########################
    skipped 1 lines
    138 136              if cond1:
    139 137                  continue
    140 138   
    141  -              ends = join.endswith
    142 139              join = str(join).replace('\/', '/')
    143 140              ##########################
    144 141              # ADD OUT SCOPE
    skipped 9 lines
    154 151              if urparse.query != '':
    155 152                  self._QUERY_LINKS = self.rept(join, self._QUERY_LINKS)
    156 153   
     154 +              # If the link is a media link (mp4, ...) rather than a web page
    157 155              broke = 0
    158 156              for ext in self.media_exts:
    159  -                  if (f'.{ext}/' in join) or ends(f'.{ext}'):
     157 +                  if (f'.{ext}/' in join) or join.endswith(f'.{ext}'):
    160 158                      self._MEDIA = self.rept(join, self._MEDIA)
    161 159                      broke = 1
    162 160                      break
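
Review note: the removed ends = join.endswith was bound before join was reassigned on the following line, so the suffix check ran against the stale, pre-replace string; calling join.endswith directly fixes that. The media filter itself reduces to the sketch below (is_media is hypothetical and media_exts is an illustrative subset of self.media_exts):

media_exts = ('mp4', 'jpg', 'png', 'pdf')  # illustrative subset

def is_media(link):
    # A link is media if a media extension ends it or sits before a slash
    return any(f'.{ext}/' in link or link.endswith(f'.{ext}') for ext in media_exts)

print(is_media('https://example.com/video.mp4'))     # True
print(is_media('https://example.com/img.png/meta'))  # True
print(is_media('https://example.com/page.html'))     # False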
    183 181              return []
    184 182          self.passed.append(url)
    185 183          # Send Request
    186  -          # try:
    187  -          req = self.framework.request(url)
    188  -          # except:
    189  -          # return False
    190  -          # else:
    191  -          resp = req.text
     184 +          try:
     185 +              req = self.framework.request(url)
     186 +          except:
     187 +              return False
     188 +          else:
     189 +              resp = req.text
    192 190   
    193 191          pp = self.framework.page_parse(resp)
    194 192   
    skipped 140 lines
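
Review note: this hunk re-enables the guarded request, so an unreachable page makes the crawler return False instead of raising into the caller. Note the bare except also swallows KeyboardInterrupt; a sketch of the same pattern with a narrower handler, using the requests library in place of framework.request (fetch is a hypothetical stand-in):

import requests

def fetch(url, timeout=10):
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Unreachable or malformed page: tell the caller to skip it
        return False
    else:
        return resp.text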