add all politicians - Commit 04f99b73 - STRLCPY/451-CorporateRiskMiner

add all politicians
Marko Sahan committed 2 years ago

04f99b73

1 parent ce1cb02c

Total 6 files Show one by one

■ ■ ■ ■ ■ ■

requirements.txt

1 1 elementpath
2 - enum
3 2 csv
3 + urllib
4 4 requests==2.22.0
5 5 beautifulsoup4==4.8.1

All occurrences

■ ■ ■ ■ ■ ■

sanctions_and_peps/README.md

		skipped 19 lines
20	20		python sanctions_and_peps/ru_bl_peps_parser.py -o /sanctions_and_peps/parsed/ru_bl_peps_parsed.csv
21	21		```
22	22
	23	+	All politicians:
	24	+
	25	+	The data are scraped from [GitHub](https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/countries.json)
	26	+
	27	+	```
	28	+	python sanctions_and_peps/every_politician_parser.py -i sanctions_and_peps/source/every_politician.json -o sanctions_and_peps/parsed/
	29	+	```
	30	+
	31	+	The script automatically downloads the data from the links listed in the source file, which takes approximately 150 MB of disk space.
	32	+
23	33		Navalny list:
24	34
25	35		The parsed data are taken from [OCCRP](https://www.occrp.org/en/daily/16253-navalny-s-foundation-lists-putin-s-6-000-bribe-takers-and-warmongers)
		skipped 2 lines

■ ■ ■ ■ ■ ■

sanctions_and_peps/every_politician_parser.py

	1	+	import argparse
	2	+	import csv
	3	+	import json
	4	+	import urllib.request as request
	5	+	import pathlib
	6	+
	7	+	POLITICIANS_JSON_DIR = "politicians_raw_jsons"  # sub-directory of the output path that holds the downloaded JSON files
	8	+	POLITICIANS_PARSED_CSV = "politicians_parsed.csv"  # file name of the CSV that main() writes into the output path
	9	+
	10	+	def parse_args():
	11	+	parser = argparse.ArgumentParser()
	12	+	parser.add_argument("-i", "--input", type=str, required=True)
	13	+	parser.add_argument("-o", "--out", type=str, required=True)
	14	+	return parser.parse_args()
	15	+
	16	+
	17	+	def download_data(input_path, out_path):
	18	+	path_w_jsons = out_path / POLITICIANS_JSON_DIR
	19	+	if not path_w_jsons.exists():
	20	+	path_w_jsons.mkdir()
	21	+	with open(input_path, "r") as f:
	22	+	data = json.load(f)
	23	+	for country in data:
	24	+	country_name = country["name"].lower()
	25	+	for i, legislature in enumerate(country["legislatures"]):
	26	+	with request.urlopen(legislature["popolo_url"]) as url:
	27	+	names = json.loads(url.read())
	28	+	file_name = f"{country_name}_{i}.json"
	29	+	with open(path_w_jsons / file_name, "w") as json_2_write:
	30	+	json.dump(names, json_2_write)
	31	+
	32	+
	33	+	def parse_person(person_dict):
	34	+	return [person_dict["id"], person_dict["name"], person_dict.get("birth_date")]
	35	+
	36	+
	37	+	def extract_entites(path):
	38	+	path_w_jsons = path / POLITICIANS_JSON_DIR
	39	+	all_politicians = []
	40	+	for json_path in path_w_jsons.iterdir():
	41	+	with open(json_path, "r") as f:
	42	+	data = json.load(f)
	43	+	country = json_path.name.split("_")[0]
	44	+	persons = [parse_person(person) + [country] for person in data["persons"]]
	45	+	# we do not extract data["organizations"]
	46	+	all_politicians.extend(persons)
	47	+	return all_politicians
	48	+
	49	+	def main():
	50	+	args = parse_args()
	51	+
	52	+	path_out = pathlib.Path(args.out)
	53	+	if not (path_out / POLITICIANS_JSON_DIR).exists():
	54	+	print("Downloading jsons...")
	55	+	download_data(pathlib.Path(args.input), pathlib.Path(args.out))
	56	+
	57	+	entities = extract_entites(path_out)
	58	+
	59	+	header = ["ID", "NAME", "DOB", "COUNTRY"]
	60	+
	61	+	with open(path_out / POLITICIANS_PARSED_CSV, "w") as f:
	62	+	writer = csv.writer(f)
	63	+	writer.writerow(header)
	64	+	for row in entities:
	65	+	writer.writerow(row)
	66	+
	67	+	if __name__=="__main__":
	68	+	main()

sanctions_and_peps/parsed/politicians_parsed.csv

Unable to diff as the file is too large.
sanctions_and_peps/source/every_politician.json

Diff is too large to be displayed.

■ ■ ■ ■ ■ ■

sanctions_and_peps/un_parser.py

		skipped 1 lines
2	2		import csv
3	3
4	4		from lxml import etree
5		-	from typing import Dict, List, Optional, Union
6		-	from enum import Enum
7	5
8	6		def parse_args():
9	7		parser = argparse.ArgumentParser()
		skipped 119 lines

Please wait...

Page is in error, reload to recover