Projects STRLCPY maigret Commits c7977f1c
🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■
    Makefile
    skipped 5 lines
    6 6   coverage html
    7 7   
    8 8  rerun-tests:
    9  - pytest --lf
     9 + pytest --lf -vv
    10 10   
    11 11  lint:
    12 12   @echo 'syntax errors or undefined names'
    skipped 24 lines
  • ■ ■ ■ ■ ■ ■
    example.ipynb
     1 +{
     2 + "cells": [
     3 + {
     4 + "cell_type": "code",
     5 + "execution_count": null,
     6 + "metadata": {
     7 + "id": "8v6PEfyXb0Gx"
     8 + },
     9 + "outputs": [],
     10 + "source": [
     11 + "# clone the repo\n",
     12 + "!git clone https://github.com/soxoj/maigret\n",
     13 + "!pip3 install -r maigret/requirements.txt"
     14 + ]
     15 + },
     16 + {
     17 + "cell_type": "code",
     18 + "execution_count": null,
     19 + "metadata": {
     20 + "id": "cXOQUAhDchkl"
     21 + },
     22 + "outputs": [],
     23 + "source": [
     24 + "# help\n",
     25 + "!python3 maigret/maigret.py --help"
     26 + ]
     27 + },
     28 + {
     29 + "cell_type": "code",
     30 + "execution_count": null,
     31 + "metadata": {
     32 + "id": "SjDmpN4QGnJu"
     33 + },
     34 + "outputs": [],
     35 + "source": [
     36 + "# search\n",
     37 + "!python3 maigret/maigret.py user"
     38 + ]
     39 + }
     40 + ],
     41 + "metadata": {
     42 + "colab": {
     43 + "collapsed_sections": [],
     44 + "include_colab_link": true,
     45 + "name": "maigret.ipynb",
     46 + "provenance": []
     47 + },
     48 + "kernelspec": {
     49 + "display_name": "Python 3",
     50 + "language": "python",
     51 + "name": "python3"
     52 + },
     53 + "language_info": {
     54 + "codemirror_mode": {
     55 + "name": "ipython",
     56 + "version": 3
     57 + },
     58 + "file_extension": ".py",
     59 + "mimetype": "text/x-python",
     60 + "name": "python",
     61 + "nbconvert_exporter": "python",
     62 + "pygments_lexer": "ipython3",
     63 + "version": "3.7.10"
     64 + }
     65 + },
     66 + "nbformat": 4,
     67 + "nbformat_minor": 1
     68 +}
     69 + 
  • ■ ■ ■ ■ ■
    maigret/checking.py
    skipped 35 lines
    36 36   
    37 37   
    38 38  SUPPORTED_IDS = (
     39 + "username",
    39 40   "yandex_public_id",
    40 41   "gaia_id",
    41 42   "vk_id",
    skipped 840 lines
  • ■ ■ ■ ■ ■ ■
    maigret/maigret.py
    skipped 33 lines
    34 34   save_json_report,
    35 35   get_plaintext_report,
    36 36   sort_report_by_data_points,
     37 + save_graph_report,
    37 38  )
    38 39  from .sites import MaigretDatabase
    39 40  from .submit import Submitter
    skipped 20 lines
    60 61   query_notify.warning(
    61 62   'You can see detailed site check errors with a flag `--print-errors`'
    62 63   )
    63  - 
    64  - 
    65  -def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
    66  - results = {}
    67  - for s in db.sites:
    68  - result = s.extract_id_from_url(url)
    69  - if not result:
    70  - continue
    71  - _id, _type = result
    72  - results[_id] = _type
    73  - return results
    74 64   
    75 65   
    76 66  def extract_ids_from_page(url, logger, timeout=5) -> dict:
    skipped 41 lines
    118 108   ids_results[u] = utype
    119 109   
    120 110   for url in dictionary.get('ids_links', []):
    121  - ids_results.update(extract_ids_from_url(url, db))
     111 + ids_results.update(db.extract_ids_from_url(url))
    122 112   
    123 113   return ids_results
    124 114   
    skipped 307 lines
    432 422   help="Generate a PDF report (general report on all usernames).",
    433 423   )
    434 424   report_group.add_argument(
     425 + "-G",
     426 + "--graph",
     427 + action="store_true",
     428 + dest="graph",
     429 + default=False,
     430 + help="Generate a graph report (general report on all usernames).",
     431 + )
     432 + report_group.add_argument(
    435 433   "-J",
    436 434   "--json",
    437 435   action="store",
    skipped 254 lines
    692 690   filename = report_filepath_tpl.format(username=username, postfix='.pdf')
    693 691   save_pdf_report(filename, report_context)
    694 692   query_notify.warning(f'PDF report on all usernames saved in {filename}')
     693 + 
     694 + if args.graph:
     695 + filename = report_filepath_tpl.format(username=username, postfix='.html')
     696 + save_graph_report(filename, general_results, db)
     697 + query_notify.warning(f'Graph report on all usernames saved in {filename}')
    695 698   
    696 699   text_report = get_plaintext_report(report_context)
    697 700   if text_report:
    skipped 19 lines
  • ■ ■ ■ ■ ■ ■
    maigret/report.py
     1 +import ast
    1 2  import csv
    2 3  import io
    3 4  import json
    skipped 7 lines
    11 12  from dateutil.parser import parse as parse_datetime_str
    12 13  from jinja2 import Template
    13 14  from xhtml2pdf import pisa
     15 +from pyvis.network import Network
     16 +import networkx as nx
    14 17   
     18 +from .checking import SUPPORTED_IDS
    15 19  from .result import QueryStatus
     20 +from .sites import MaigretDatabase
    16 21  from .utils import is_country_tag, CaseConverter, enrich_link_str
    17 22   
    18 23  SUPPORTED_JSON_REPORT_FORMATS = [
    skipped 61 lines
    80 85  def save_json_report(filename: str, username: str, results: dict, report_type: str):
    81 86   with open(filename, "w", encoding="utf-8") as f:
    82 87   generate_json_report(username, results, f, report_type=report_type)
     88 + 
     89 + 
     90 +class MaigretGraph:
     91 + other_params = {'size': 10, 'group': 3}
     92 + site_params = {'size': 15, 'group': 2}
     93 + username_params = {'size': 20, 'group': 1}
     94 + 
     95 + def __init__(self, graph):
     96 + self.G = graph
     97 + 
     98 + def add_node(self, key, value):
     99 + node_name = f'{key}: {value}'
     100 + 
     101 + params = self.other_params
     102 + if key in SUPPORTED_IDS:
     103 + params = self.username_params
     104 + elif value.startswith('http'):
     105 + params = self.site_params
     106 + 
     107 + self.G.add_node(node_name, title=node_name, **params)
     108 + 
     109 + if value != value.lower():
     110 + normalized_node_name = self.add_node(key, value.lower())
     111 + self.link(node_name, normalized_node_name)
     112 + 
     113 + return node_name
     114 + 
     115 + def link(self, node1_name, node2_name):
     116 + self.G.add_edge(node1_name, node2_name, weight=2)
     117 + 
     118 + 
     119 +def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
     120 + G = nx.Graph()
     121 + graph = MaigretGraph(G)
     122 + 
     123 + for username, id_type, results in username_results:
     124 + username_node_name = graph.add_node(id_type, username)
     125 + 
     126 + for website_name in results:
     127 + dictionary = results[website_name]
     128 + # TODO: fix no site data issue
     129 + if not dictionary:
     130 + continue
     131 + 
     132 + if dictionary.get("is_similar"):
     133 + continue
     134 + 
     135 + status = dictionary.get("status")
     136 + if not status: # FIXME: currently in case of timeout
     137 + continue
     138 + 
     139 + if dictionary["status"].status != QueryStatus.CLAIMED:
     140 + continue
     141 + 
     142 + site_fallback_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
     143 + # site_node_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
     144 + site_node_name = graph.add_node('site', site_fallback_name)
     145 + graph.link(username_node_name, site_node_name)
     146 + 
     147 + def process_ids(parent_node, ids):
     148 + for k, v in ids.items():
     149 + if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
     150 + continue
     151 + if k in 'image':
     152 + continue
     153 + 
     154 + v_data = v
     155 + if v.startswith('['):
     156 + try:
     157 + v_data = ast.literal_eval(v)
     158 + except Exception as e:
     159 + logging.error(e)
     160 + 
     161 + # value is a list
     162 + if isinstance(v_data, list):
     163 + list_node_name = graph.add_node(k, site_fallback_name)
     164 + for vv in v_data:
     165 + data_node_name = graph.add_node(vv, site_fallback_name)
     166 + graph.link(list_node_name, data_node_name)
     167 + 
     168 + add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()}
     169 + if add_ids:
     170 + process_ids(data_node_name, add_ids)
     171 + else:
     172 + # value is just a string
     173 + # ids_data_name = f'{k}: {v}'
     174 + # if ids_data_name == parent_node:
     175 + # continue
     176 + 
     177 + ids_data_name = graph.add_node(k, v)
     178 + # G.add_node(ids_data_name, size=10, title=ids_data_name, group=3)
     179 + graph.link(parent_node, ids_data_name)
     180 + 
     181 + # check for username
     182 + if 'username' in k or k in SUPPORTED_IDS:
     183 + new_username_node_name = graph.add_node('username', v)
     184 + graph.link(ids_data_name, new_username_node_name)
     185 + 
     186 + add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
     187 + if add_ids:
     188 + process_ids(ids_data_name, add_ids)
     189 + 
     190 + if status.ids_data:
     191 + process_ids(site_node_name, status.ids_data)
     192 + 
     193 + nodes_to_remove = []
     194 + for node in G.nodes:
     195 + if len(str(node)) > 100:
     196 + nodes_to_remove.append(node)
     197 + 
     198 + [G.remove_node(node) for node in nodes_to_remove]
     199 + 
     200 + nt = Network(notebook=True, height="750px", width="100%")
     201 + nt.from_nx(G)
     202 + nt.show(filename)
    83 203   
    84 204   
    85 205  def get_plaintext_report(context: dict) -> str:
    skipped 314 lines
  • ■ ■ ■ ■ ■ ■
    maigret/resources/data.json
    skipped 3642 lines
    3643 3643   "errors": {
    3644 3644   "Invalid API key": "New API key needed"
    3645 3645   },
     3646 + "regexCheck": "^[^/]+$",
    3646 3647   "urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F",
    3647 3648   "checkType": "status_code",
    3648 3649   "presenseStrs": [
    skipped 9387 lines
    13036 13037   "us"
    13037 13038   ],
    13038 13039   "headers": {
    13039  - "authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
     13040 + "authorization": "Bearer BQBbhm9gxBxIDmwZvO8mzV28G7V07L57WlKILvhXijRaTxwh9N03yHxSLADfioU3uWYDAjjq_mMWQSbQ2OA"
    13040 13041   },
    13041 13042   "errors": {
    13042 13043   "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
    skipped 1420 lines
    14463 14464   "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
    14464 14465   "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    14465 14466   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    14466  - "x-guest-token": "1403829602053771266"
     14467 + "x-guest-token": "1404906435025195008"
    14467 14468   },
    14468 14469   "errors": {
    14469 14470   "Bad guest token": "x-guest-token update required"
    skipped 400 lines
    14870 14871   "video"
    14871 14872   ],
    14872 14873   "headers": {
    14873  - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
     14874 + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM3OTYyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.ZUCw6WWIPIoVy1zoj8AKA1EMfX6ao7hJI2pWxgAZlac"
    14874 14875   },
    14875 14876   "activation": {
    14876 14877   "url": "https://vimeo.com/_rv/viewer",
    skipped 13644 lines
  • ■ ■ ■ ■ ■ ■
    maigret/sites.py
    skipped 399 lines
    400 400   
    401 401   return found_flags
    402 402   
     403 + 
    def extract_ids_from_url(self, url: str) -> dict:
        """Collect the ids that the sites in ``self._sites`` extract from ``url``.

        Each site's ``extract_id_from_url`` is called in turn; falsy results
        are ignored and every other result is unpacked as an ``(id, type)``
        pair. Returns a dict mapping id to type; when several sites yield the
        same id, the last one wins.
        """
        found = {}
        for site in self._sites:
            match = site.extract_id_from_url(url)
            if match:
                identifier, id_type = match
                found[identifier] = id_type
        return found
     413 + 
     414 + 
    403 415   def get_db_stats(self, sites_dict):
    404 416   if not sites_dict:
    405 417   sites_dict = self.sites_dict()
    skipped 36 lines
  • ■ ■ ■ ■ ■ ■
    requirements.txt
    skipped 36 lines
    37 37  xhtml2pdf==0.2.5
    38 38  XMind==1.2.0
    39 39  yarl==1.6.3
     40 +networkx==2.5.1
     41 +pyvis==0.1.9
    40 42   
  • ■ ■ ■ ■ ■
    tests/test_cli.py
    skipped 12 lines
    13 13   'disable_recursive_search': False,
    14 14   'folderoutput': 'reports',
    15 15   'html': False,
     16 + 'graph': False,
    16 17   'id_type': 'username',
    17 18   'ignore_ids_list': [],
    18 19   'info': False,
    skipped 80 lines
  • ■ ■ ■ ■ ■ ■
    tests/test_maigret.py
    skipped 8 lines
    9 9  from maigret.maigret import (
    10 10   extract_ids_from_page,
    11 11   extract_ids_from_results,
    12  - extract_ids_from_url,
    13 12  )
    14 13  from maigret.sites import MaigretSite
    15 14  from maigret.result import QueryResult, QueryStatus
    skipped 128 lines
    144 143   
    145 144   
    146 145  def test_extract_ids_from_url(default_db):
    147  - assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {
     146 + assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == {
    148 147   'test': 'username'
    149 148   }
    150  - assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'}
    151  - assert extract_ids_from_url('https://vk.com/ida123', default_db) == {
     149 + assert default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'}
     150 + assert default_db.extract_ids_from_url('https://vk.com/ida123') == {
    152 151   'ida123': 'username'
    153 152   }
    154  - assert extract_ids_from_url(
    155  - 'https://my.mail.ru/yandex.ru/dipres8904/', default_db
     153 + assert default_db.extract_ids_from_url(
     154 + 'https://my.mail.ru/yandex.ru/dipres8904/'
    156 155   ) == {'dipres8904': 'username'}
    157  - assert extract_ids_from_url(
    158  - 'https://reviews.yandex.ru/user/adbced123', default_db
     156 + assert default_db.extract_ids_from_url(
     157 + 'https://reviews.yandex.ru/user/adbced123'
    159 158   ) == {'adbced123': 'yandex_public_id'}
    160 159   
    161 160   
    skipped 18 lines
Please wait...
Page is in error, reload to recover