■ ■ ■ ■ ■ ■
maryam/core/util/web_scrap.py
| skipped 17 lines |
18 | 18 | | import re |
19 | 19 | | import concurrent.futures |
20 | 20 | | |
21 | | - | # Web Scraper v5.1 |
| 21 | + | # Web Scrap v5.1 |
22 | 22 | | |
23 | 23 | | class main: |
24 | 24 | | |
| skipped 3 lines |
28 | 28 | | url : First page address |
29 | 29 | | debug : Show the result at moment |
30 | 30 | | limit : Web scrap level (1 means only the first page is searched) |
31 | | - | thread_count : Count of links for open at per lap |
| 31 | + | thread_count : Number of links for each lap |
32 | 32 | | """ |
33 | 33 | | self.framework = main.framework |
34 | | - | self.url = url |
| 34 | + | self.parser = self.framework.urlib(url) |
| 35 | + | self.url = self.parser.sub_service(self.framework._global_options['protocol'], ifany=True) |
35 | 36 | | self.urlib = self.framework.urlib |
36 | 37 | | self.debug = debug |
37 | 38 | | self.limit = limit |
| skipped 68 lines |
106 | 107 | | return True |
107 | 108 | | |
108 | 109 | | def joiner(self, url): |
109 | | - | url = str(url) |
110 | | - | # ADD slash to end url |
| 110 | + | url = url |
111 | 111 | | urparse = self.urlib(url) |
112 | 112 | | urparse.url = urparse.quote if '%' not in url else url |
113 | | - | urparse2 = self.urlib(str(self.url)) |
114 | | - | cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') |
| 113 | + | cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') or '.' not in url |
115 | 114 | | cond12 = url.endswith(':') |
116 | 115 | | cond2 = len( |
117 | 116 | | urparse.url) > 1 and '%3a//' not in urparse.url and urparse.url[:2] != '//' |
| skipped 1 lines |
119 | 118 | | if cond1 or cond12: |
120 | 119 | | return False |
121 | 120 | | elif cond2: |
122 | | - | urparse.url = urparse2.join(url) |
| 121 | + | urparse.url = self.parser.join(url) |
123 | 122 | | elif cond3: |
124 | 123 | | urparse.url = url |
125 | 124 | | else: |
126 | | - | urparse.url = urparse2.join(url) |
127 | | - | return str(urparse.url) |
| 125 | + | urparse.url = self.parser.join(url) |
| 126 | + | return urparse.url |
128 | 127 | | |
129 | 128 | | def link_category(self, urls): |
130 | 129 | | links = [] |
131 | 130 | | for url in urls: |
132 | 131 | | join = self.joiner(url) |
133 | | - | |
134 | 132 | | ########################## |
135 | 133 | | # ADD CDN, PHONE and EMAIL |
136 | 134 | | ########################## |
| skipped 1 lines |
138 | 136 | | if cond1: |
139 | 137 | | continue |
140 | 138 | | |
141 | | - | ends = join.endswith |
142 | 139 | | join = str(join).replace('\/', '/') |
143 | 140 | | ########################## |
144 | 141 | | # ADD OUT SCOPE |
| skipped 9 lines |
154 | 151 | | if urparse.query != '': |
155 | 152 | | self._QUERY_LINKS = self.rept(join, self._QUERY_LINKS) |
156 | 153 | | |
| 154 | + | # Check whether the link is a media file (mp4, ...) rather than a web page |
157 | 155 | | broke = 0 |
158 | 156 | | for ext in self.media_exts: |
159 | | - | if (f'.{ext}/' in join) or ends(f'.{ext}'): |
| 157 | + | if (f'.{ext}/' in join) or join.endswith(f'.{ext}'): |
160 | 158 | | self._MEDIA = self.rept(join, self._MEDIA) |
161 | 159 | | broke = 1 |
162 | 160 | | break |
| skipped 20 lines |
183 | 181 | | return [] |
184 | 182 | | self.passed.append(url) |
185 | 183 | | # Send Request |
186 | | - | # try: |
187 | | - | req = self.framework.request(url) |
188 | | - | # except: |
189 | | - | # return False |
190 | | - | # else: |
191 | | - | resp = req.text |
| 184 | + | try: |
| 185 | + | req = self.framework.request(url) |
| 186 | + | except: |
| 187 | + | return False |
| 188 | + | else: |
| 189 | + | resp = req.text |
192 | 190 | | |
193 | 191 | | pp = self.framework.page_parse(resp) |
194 | 192 | | |
| skipped 140 lines |