STRLCPY/maigret

Merge pull request #53 from soxoj/json-reports-submit-improvements
```
Added JSON reports
```
soxoj committed with GitHub 4 years ago

53f72eda

2 parents
7676c053
631de7b3

Revision indexing in progress... (symbol navigation in revisions will be accurate once indexing has finished)

■ ■ ■ ■ ■ ■

maigret/maigret.py

		skipped 12 lines
13	13		from .checking import *
14	14		from .notify import QueryNotifyPrint
15	15		from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
16		-	generate_report_context, save_txt_report
	16	+	generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
	17	+	save_json_report
17	18		from .submit import submit_dialog
18	19
19	20		__version__ = '0.1.13'
		skipped 36 lines
56	57		action="store", dest="proxy", default=None,
57	58		help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080"
58	59		)
59		-	parser.add_argument("--json", "-j", metavar="JSON_FILE",
60		-	dest="json_file", default=None,
61		-	help="Load data from a JSON file or an online, valid, JSON file.")
	60	+	parser.add_argument("--db", metavar="DB_FILE",
	61	+	dest="db_file", default=None,
	62	+	help="Load Maigret database from a JSON file or an online, valid, JSON file.")
62	63		parser.add_argument("--cookies-jar-file", metavar="COOKIE_FILE",
63	64		dest="cookie_file", default=None,
64	65		help="File with cookies.")
		skipped 26 lines
91	92		action="store_true", dest="print_check_errors", default=False,
92	93		help="Print errors messages: connection, captcha, site country ban, etc."
93	94		)
94		-	parser.add_argument("--submit",
	95	+	parser.add_argument("--submit", metavar='EXISTING_USER_URL',
95	96		type=str, dest="new_site_to_submit", default=False,
96	97		help="URL of existing profile in new site to submit."
97	98		)
		skipped 60 lines
158	159		dest="pdf", default=False,
159	160		help="Generate a PDF report (general report on all usernames)."
160	161		)
	162	+	parser.add_argument("-J", "--json",
	163	+	action="store", metavar='REPORT_TYPE',
	164	+	dest="json", default='', type=check_supported_json_format,
	165	+	help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
	166	+	" (one report per username)."
	167	+	)
161	168
162	169		args = parser.parse_args()
163	170
		skipped 42 lines
206	213		if args.tags:
207	214		args.tags = list(set(str(args.tags).split(',')))
208	215
209		-	if args.json_file is None:
210		-	args.json_file = \
	216	+	if args.db_file is None:
	217	+	args.db_file = \
211	218		os.path.join(os.path.dirname(os.path.realpath(__file__)),
212	219		"resources/data.json"
213	220		)
		skipped 9 lines
223	230		color=not args.no_color)
224	231
225	232		# Create object with all information about sites we are aware of.
226		-	db = MaigretDatabase().load_from_file(args.json_file)
	233	+	db = MaigretDatabase().load_from_file(args.db_file)
227	234		get_top_sites_for_id = lambda x: db.ranked_sites_dict(top=args.top_sites, tags=args.tags,
228	235		names=args.site_list,
229	236		disabled=False, id_type=x)
		skipped 3 lines
233	240		if args.new_site_to_submit:
234	241		is_submitted = await submit_dialog(db, args.new_site_to_submit)
235	242		if is_submitted:
236		-	db.save_to_file(args.json_file)
	243	+	db.save_to_file(args.db_file)
237	244
238	245		# Database self-checking
239	246		if args.self_check:
		skipped 1 lines
241	248		is_need_update = await self_check(db, site_data, logger, max_connections=args.connections)
242	249		if is_need_update:
243	250		if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
244		-	db.save_to_file(args.json_file)
	251	+	db.save_to_file(args.db_file)
245	252		print('Database was successfully updated.')
246	253		else:
247	254		print('Updates will be applied only for current search session.')
		skipped 91 lines
339	346		save_txt_report(filename, username, results)
340	347		query_notify.warning(f'TXT report for {username} saved in {filename}')
341	348
	349	+	if args.json:
	350	+	filename = report_filepath_tpl.format(username=username, postfix=f'_{args.json}.json')
	351	+	save_json_report(filename, username, results, report_type=args.json)
	352	+	query_notify.warning(f'JSON {args.json} report for {username} saved in {filename}')
	353	+
	354	+
342	355		# reporting for all the result
343	356		if general_results:
344	357		if args.html or args.pdf:
		skipped 12 lines
357	370		save_pdf_report(filename, report_context)
358	371		query_notify.warning(f'PDF report on all usernames saved in {filename}')
359	372		# update database
360		-	db.save_to_file(args.json_file)
	373	+	db.save_to_file(args.db_file)
361	374
362	375
363	376		def run():
		skipped 11 lines

■ ■ ■ ■ ■ ■

maigret/report.py

1	1		import csv
	2	+	import json
2	3		import io
3	4		import logging
4	5		import os
		skipped 2 lines
7	8		from datetime import datetime
8	9		from jinja2 import Template
9	10		from xhtml2pdf import pisa
	11	+	from argparse import ArgumentTypeError
10	12		from dateutil.parser import parse as parse_datetime_str
11	13
12	14		from .result import QueryStatus
13	15		from .utils import is_country_tag, CaseConverter, enrich_link_str
	16	+
	17	+	SUPPORTED_JSON_REPORT_FORMATS = [
	18	+	'simple',
	19	+	'ndjson',
	20	+	]
14	21
15	22
16	23		'''
		skipped 33 lines
50	57		filled_template = template.render(**context)
51	58		with open(filename, 'w+b') as f:
52	59		pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)
	60	+
	61	+	def save_json_report(filename: str, username: str, results: dict, report_type: str):
	62	+	with open(filename, 'w', encoding='utf-8') as f:
	63	+	generate_json_report(username, results, f, report_type=report_type)
53	64
54	65
55	66		'''
		skipped 169 lines
225	236		file.write(dictionary["url_user"] + "\n")
226	237		file.write(f'Total Websites Username Detected On : {exists_counter}')
227	238
	239	+
	240	+	def generate_json_report(username: str, results: dict, file, report_type):
	241	+	exists_counter = 0
	242	+	is_report_per_line = report_type.startswith('ndjson')
	243	+	all_json = {}
	244	+
	245	+	for sitename in results:
	246	+	site_result = results[sitename]
	247	+	# TODO: fix no site data issue
	248	+	if not site_result or site_result.get("status").status != QueryStatus.CLAIMED:
	249	+	continue
	250	+
	251	+	data = dict(site_result)
	252	+	data['status'] = data['status'].json()
	253	+
	254	+	if is_report_per_line:
	255	+	data['sitename'] = sitename
	256	+	file.write(json.dumps(data)+'\n')
	257	+	else:
	258	+	all_json[sitename] = data
	259	+
	260	+	if not is_report_per_line:
	261	+	file.write(json.dumps(all_json))
	262	+
228	263		'''
229	264		XMIND 8 Functions
230	265		'''
		skipped 74 lines
305	340		currentsublabel = undefinedsection.addSubTopic()
306	341		currentsublabel.setTitle("%s: %s" % (k, v))
307	342
	343	+
	344	+	def check_supported_json_format(value):
	345	+	if value and not value in SUPPORTED_JSON_REPORT_FORMATS:
	346	+	raise ArgumentTypeError(f'JSON report type must be one of the following types: '
	347	+	+ ', '.join(SUPPORTED_JSON_REPORT_FORMATS))
	348	+	return value
308	349
309	350

■ ■ ■ ■ ■ ■

maigret/resources/data.json

		skipped 23095 lines
23096	23096		"urlMain": "https://www.are.na",
23097	23097		"usernameClaimed": "nate-cassel",
23098	23098		"usernameUnclaimed": "noonewouldeverusethis7"
	23099	+	},
	23100	+	"mywishboard.com": {
	23101	+	"checkType": "message",
	23102	+	"presenseStrs": [
	23103	+	"profile-header",
	23104	+	" profile-header__col"
	23105	+	],
	23106	+	"absenceStrs": [
	23107	+	"This page could not be found"
	23108	+	],
	23109	+	"url": "https://mywishboard.com/@{username}",
	23110	+	"urlMain": "https://mywishboard.com",
	23111	+	"usernameClaimed": "alex",
	23112	+	"usernameUnclaimed": "noonewouldeverusethis7"
	23113	+	},
	23114	+	"crafta.ua": {
	23115	+	"checkType": "message",
	23116	+	"presenseStrs": [
	23117	+	"cft-profile-about"
	23118	+	],
	23119	+	"absenceStrs": [
	23120	+	"Page not found"
	23121	+	],
	23122	+	"url": "https://{username}.crafta.ua/",
	23123	+	"urlMain": "https://crafta.ua",
	23124	+	"usernameClaimed": "test",
	23125	+	"usernameUnclaimed": "noonewouldeverusethis7"
	23126	+	},
	23127	+	"m.smutty.com": {
	23128	+	"tags": [
	23129	+	"erotic"
	23130	+	],
	23131	+	"checkType": "message",
	23132	+	"presenseStrs": [
	23133	+	"profile_stats_n"
	23134	+	],
	23135	+	"absenceStrs": [
	23136	+	"Not Found</span>"
	23137	+	],
	23138	+	"url": "https://m.smutty.com/user/{username}/",
	23139	+	"urlMain": "https://m.smutty.com",
	23140	+	"usernameClaimed": "alex",
	23141	+	"usernameUnclaimed": "noonewouldeverusethis7"
	23142	+	},
	23143	+	"www.marykay.ru": {
	23144	+	"checkType": "message",
	23145	+	"presenseStrs": [
	23146	+	"email"
	23147	+	],
	23148	+	"absenceStrs": [
	23149	+	"errorPage"
	23150	+	],
	23151	+	"url": "https://www.marykay.ru/{username}",
	23152	+	"urlMain": "https://www.marykay.ru",
	23153	+	"usernameClaimed": "anna",
	23154	+	"usernameUnclaimed": "noonewouldeverusethis7"
23099	23155		}
23100	23156		},
23101	23157		"engines": {
		skipped 103 lines

■ ■ ■ ■ ■ ■

maigret/result.py

1		-	"""Sherlock Result Module
	1	+	"""Maigret Result Module
2	2
3	3		This module defines various objects for recording the results of queries.
4	4		"""
		skipped 69 lines
74	74		self.ids_data = ids_data
75	75		self.tags = tags
76	76
	77	+	def json(self):
	78	+	return {
	79	+	'username': self.username,
	80	+	'site_name': self.site_name,
	81	+	'url': self.site_url_user,
	82	+	'status': str(self.status),
	83	+	'ids': self.ids_data or {},
	84	+	'tags': self.tags,
	85	+	}
77	86
78	87		def __str__(self):
79	88		"""Convert Object To String.
		skipped 15 lines

■ ■ ■ ■ ■ ■

maigret/submit.py

1	1		import difflib
	2	+	import json
2	3
3	4		import requests
4	5		from mock import Mock
		skipped 5 lines
10	11
11	12		RATIO = 0.6
12	13		TOP_FEATURES = 5
	14	+	URL_RE = re.compile(r'https?://(www\.)?')
13	15
14	16
15	17		def get_match_ratio(x):
		skipped 68 lines
84	86
85	87
86	88		async def submit_dialog(db, url_exists):
	89	+	domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
	90	+	domain_raw = domain_raw.split('/')[0]
	91	+
	92	+	matched_sites = list(filter(lambda x: domain_raw in x.url_main+x.url, db.sites))
	93	+	if matched_sites:
	94	+	print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
	95	+	status = lambda s: '(disabled)' if s.disabled else ''
	96	+	url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
	97	+	print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
	98	+	return False
	99	+
87	100		url_parts = url_exists.split('/')
88	101		supposed_username = url_parts[-1]
89	102		new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
		skipped 13 lines
103	116		a_minus_b = tokens_a.difference(tokens_b)
104	117		b_minus_a = tokens_b.difference(tokens_a)
105	118
106		-	top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: '))
107		-	if not top_features_count:
108		-	top_features_count = TOP_FEATURES
	119	+	top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ') or TOP_FEATURES)
109	120
110	121		presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]
111	122
		skipped 51 lines

■ ■ ■ ■ ■ ■ ■

tests/test_report.py

1	1		"""Maigret reports test functions"""
2	2		import copy
	3	+	import json
3	4		import os
4	5		from io import StringIO
5	6
		skipped 1 lines
7	8		from jinja2 import Template
8	9
9	10		from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
10		-	save_pdf_report, generate_report_template, generate_report_context
	11	+	save_pdf_report, generate_report_template, generate_report_context, generate_json_report
11	12		from maigret.result import QueryResult, QueryStatus
12	13
13	14		EXAMPLE_RESULTS = {
		skipped 130 lines
144	145		'https://www.github.com/test\n',
145	146		'Total Websites Username Detected On : 1',
146	147		]
	148	+
	149	+
	150	+	def test_generate_json_simple_report():
	151	+	jsonfile = StringIO()
	152	+	MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
	153	+	MODIFIED_RESULTS['GitHub2'] = EXAMPLE_RESULTS['GitHub']
	154	+	generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'simple')
	155	+
	156	+	jsonfile.seek(0)
	157	+	data = jsonfile.readlines()
	158	+
	159	+	assert len(data) == 1
	160	+	assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2']
	161	+
	162	+
	163	+	def test_generate_json_ndjson_report():
	164	+	jsonfile = StringIO()
	165	+	MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
	166	+	MODIFIED_RESULTS['GitHub2'] = EXAMPLE_RESULTS['GitHub']
	167	+	generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'ndjson')
	168	+
	169	+	jsonfile.seek(0)
	170	+	data = jsonfile.readlines()
	171	+
	172	+	assert len(data) == 2
	173	+	assert json.loads(data[0])['sitename'] == 'GitHub'
147	174
148	175
149	176		def test_save_xmind_report():
		skipped 35 lines

Merge pull request #53 from soxoj/json-reports-submit-improvements