    README.md
    1 1  # 451 Corporate Risk Miner
    2 2   
    3 3  ## Team Members
    4  -Elena Dulskyte [github](https://github.com/ElenaDulskyte) [linkedin](https://www.linkedin.com/in/elena-dulskyte-50b83aa2/)
     4 +Elena Dulskyte [github](https://github.com/ElenaDulskyte) [linkedin](https://www.linkedin.com/in/elena-dulskyte-50b83aa2/), Senior Data Scientist at ComplyAdvantage
    5 5   
    6  -Marko Sahan [github](http://github.com/sahanmar) [linkedin](https://www.linkedin.com/in/msahan/)
     6 +Marko Sahan [github](http://github.com/sahanmar) [linkedin](https://www.linkedin.com/in/msahan/), Machine Learning Engineer at ComplyAdvantage
    7 7   
    8  -Peter Zatka-Haas [github](http://github.com/peterzh) [linkedin](https://www.linkedin.com/in/peterzatkahaas)
     8 +Peter Zatka-Haas [github](http://github.com/peterzh) [linkedin](https://www.linkedin.com/in/peterzatkahaas), Data Scientist at ComplyAdvantage
    9 9   
    10 10  ## Tool Description
    11 11   
    skipped 27 lines
    39 39  pip install -r requirements.txt
    40 40  ```
    41 41   
    42  -4. Start the streamlit app
     42 +4. Download the dataset (`1m_networks.parquet`, `1m_nodes.parquet`, `1m_edges.parquet`) from XXXX into `<root dir>/data`
     43 + 
     44 +5. Start the streamlit app
    43 45  ```
    44 46  streamlit run app/app.py
    45 47  ```
    46 48   
    47  -5. On your web browser, load [http://localhost:8501](http://localhost:8501)
     49 +6. In your web browser, open [http://localhost:8501](http://localhost:8501)
    48 50   
    49 51  ## Usage
    50 52   
    skipped 19 lines
    70 72  ### Potential next steps
    71 73  * Expand to corporate ownership databases outside of the UK, for example using OpenCorporates data.
    72 74  * Incorporate more external data sources identifying criminal or potentially-criminal activity for companies and people.
    73  -* Allow user to input custom lists.
     75 +* Allow the user to input custom lists of interesting people/companies. This would let journalists see only networks that potentially contain people of specific interest, e.g. Navalny's list of 6,000 Russian war supporters (see the sketch below).
     76 + 
    74 77   
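A minimal sketch of the custom-lists next step, showing how a user-supplied watchlist could be matched against the node data already used by the app. It assumes the `./data/1m_nodes.parquet` columns referenced in `app/utils.py` (`subgraph_hash`, plus a `node_id` whose first `|`-separated field is the entity name); the `name_matches_watchlist` helper and the CSV layout (one name per line, no header) are hypothetical, not part of this PR.
```
# Hypothetical sketch (not in the repo): filter networks by a user-supplied
# watchlist of names, following the patterns already used in app/app.py.
import pandas as pd
import streamlit as st


def name_matches_watchlist(node_id, watchlist):
    """Case-insensitive exact match of the node's display name against the watchlist."""
    name = node_id.split("|")[0].strip().lower()
    return name in watchlist


uploaded = st.sidebar.file_uploader(
    label="Custom persons/companies of interest", type="csv"
)

if uploaded is not None:
    # Expect one name per line with no header, e.g. a published list of
    # politically exposed or sanctioned individuals.
    watchlist = set(
        pd.read_csv(uploaded, header=None)[0].astype(str).str.strip().str.lower()
    )

    nodes = pd.read_parquet("./data/1m_nodes.parquet", engine="pyarrow")
    hits = nodes["node_id"].apply(name_matches_watchlist, watchlist=watchlist)

    # Keep only the networks containing at least one watchlisted entity;
    # these hashes could then pre-filter subgraph_with_risk_scores in app.py.
    matching_hashes = set(nodes.loc[hits, "subgraph_hash"])
    st.write(f"{len(matching_hashes)} networks contain a watchlisted name")
```
This mirrors the `st.file_uploader` widget that the pre-PR version of `app/app.py` used for custom names, so it should slot into the sidebar with little change.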
    app/app.py
    skipped 1 lines
    2 2  import pandas as pd
    3 3  import streamlit as st
    4 4  from streamlit_agraph import agraph, Config
     5 + 
    5 6  from utils import (
    6 7   build_agraph_components,
    7 8   get_subgraph_nodes_df,
    skipped 3 lines
    11 12   build_markdown_strings_for_node,
    12 13  )
    13 14   
    14  - 
    15 15  st.set_page_config(layout="wide")
    16  - 
    17 16   
    18 17  SLIDER_MIN = 0
    19 18  SLIDER_MAX = 100
    20 19  SLIDER_DEFAULT = 50
    21 20  DEFAULT_NUM_SUBGRAPHS_TO_SHOW = 3
    22 21  GRAPH_PLOT_HEIGHT_PX = 400
    23  -GRAPH_SIZE_RENDER_LIMIT = 40
     22 +GRAPH_SIZE_RENDER_LIMIT = 50
    24 23  subgraphs = get_subgraph_df()
    25 24   
    26 25  with st.sidebar:
    27 26   st.title("451 Corporate Risk Miner")
    28 27   
    29  - weight_chains = (
     28 + only_include_small_subnetworks = st.checkbox(
     29 + f"Only include networks small enough to render (<{GRAPH_SIZE_RENDER_LIMIT} nodes)",
     30 + value=True,
     31 + )
     32 + 
     33 + weight_cyclic = (
    30 34   st.slider(
    31  - "Long ownership chains",
     35 + "Cyclic ownership",
    32 36   min_value=SLIDER_MIN,
    33 37   max_value=SLIDER_MAX,
    34 38   value=SLIDER_DEFAULT,
    35  - disabled=True,
    36 39   )
    37 40   / SLIDER_MAX
    38 41   )
    39  - weight_cyclic = (
     42 + weight_company_ratio = (
    40 43   st.slider(
    41  - "Cyclic ownership",
     44 + "High company:officer ratio",
    42 45   min_value=SLIDER_MIN,
    43 46   max_value=SLIDER_MAX,
    44 47   value=SLIDER_DEFAULT,
    45 48   )
    46 49   / SLIDER_MAX
    47 50   )
    48  - weight_psc_haven = (
     51 + weight_proxy_directors = (
    49 52   st.slider(
    50  - "Persons of significant control associated with tax havens",
     53 + "Presence of proxy directors",
    51 54   min_value=SLIDER_MIN,
    52 55   max_value=SLIDER_MAX,
    53 56   value=SLIDER_DEFAULT,
    54  - disabled=True,
    55 57   )
    56 58   / SLIDER_MAX
    57 59   )
    58  - weight_pep = (
     60 + weight_multi_jurisdiction = (
    59 61   st.slider(
    60  - "Officers/PSCs are politically exposed",
     62 + "Associated with multiple jurisdictions",
    61 63   min_value=SLIDER_MIN,
    62 64   max_value=SLIDER_MAX,
    63 65   value=SLIDER_DEFAULT,
    64  - disabled=True,
    65 66   )
    66 67   / SLIDER_MAX
    67 68   )
    68  - weight_sanctions = (
     69 + weight_tax_havens = (
    69 70   st.slider(
    70  - "Officers/PSCs/Companies are sanctioned",
     71 + "Associated with tax havens",
    71 72   min_value=SLIDER_MIN,
    72 73   max_value=SLIDER_MAX,
    73 74   value=SLIDER_DEFAULT,
    74  - disabled=True,
    75 75   )
    76 76   / SLIDER_MAX
    77 77   )
    78  - weight_disqualified = (
     78 + weight_pep = (
    79 79   st.slider(
    80  - "Officers are disqualified directors",
     80 + "Officers/PSCs are politically exposed persons",
    81 81   min_value=SLIDER_MIN,
    82 82   max_value=SLIDER_MAX,
    83 83   value=SLIDER_DEFAULT,
    84  - disabled=True,
    85 84   )
    86 85   / SLIDER_MAX
    87 86   )
    88  - 
    89  - custom_names = st.file_uploader(
    90  - label="Custom persons/companies of interest", type="csv"
     87 + weight_russian_pep = (
     88 + st.slider(
     89 + "Officers/PSCs are Russian politically exposed persons",
     90 + min_value=SLIDER_MIN,
     91 + max_value=SLIDER_MAX,
     92 + value=SLIDER_DEFAULT,
     93 + )
     94 + / SLIDER_MAX
    91 95   )
    92 96   
    93  - if custom_names:
    94  - custom_names = pd.read_csv(custom_names, header=None)[0].tolist()
    95  - st.write(custom_names)
    96  - 
    97  - go = st.button("Go")
    98  - 
    99  - 
    100 97  with st.container():
    101 98   
    102 99   subgraph_with_risk_scores = get_subgraph_with_risk_score(
    103 100   subgraphs,
    104  - weight_chains=weight_chains,
    105 101   weight_cyclic=weight_cyclic,
    106  - weight_psc_haven=weight_psc_haven,
     102 + weight_company_ratio=weight_company_ratio,
     103 + weight_proxy_directors=weight_proxy_directors,
     104 + weight_multi_jurisdiction=weight_multi_jurisdiction,
     105 + weight_tax_havens=weight_tax_havens,
    107 106   weight_pep=weight_pep,
    108  - weight_sanctions=weight_sanctions,
    109  - weight_disqualified=weight_disqualified,
     107 + weight_russian_pep=weight_russian_pep,
    110 108   )
    111 109   
    112  - st.dataframe(data=subgraph_with_risk_scores, use_container_width=True)
     110 + if only_include_small_subnetworks:
     111 + subgraph_with_risk_scores = subgraph_with_risk_scores.query(
     112 + "node_num < @GRAPH_SIZE_RENDER_LIMIT"
     113 + )
     114 + 
      115 + # Only show the top 2,000 highest-risk networks in the table
     116 + subgraph_with_risk_scores = subgraph_with_risk_scores.head(2000)
     117 + 
     118 + st.dataframe(
     119 + data=subgraph_with_risk_scores,
     120 + use_container_width=True,
     121 + )
    113 122   
    114 123   selected_subgraph_hashes = st.multiselect(
    115 124   label="Select corporate network(s) to explore",
    skipped 23 lines
    139 148   nodes=node_objects,
    140 149   edges=edge_objects,
    141 150   config=Config(
    142  - width=round(1080 / num_subgraphs_to_display),
     151 + width=round(1920 / num_subgraphs_to_display),
    143 152   height=GRAPH_PLOT_HEIGHT_PX,
    144 153   nodeHighlightBehavior=True,
    145 154   highlightColor="#F7A7A6",
    146 155   directed=True,
    147 156   collapsible=True,
     157 + physics={
     158 + "enabled": True,
     159 + "maxVelocity": 5,
     160 + },
    148 161   ),
    149 162   )
    150 163   else:
    151 164   st.error("Subgraph is too large to render")
    152 165   
    153  - # Build markdown strings for representing metadata
     166 + # Build markdown strings for representing metadata on dodgy entities
    154 167   markdown_strings = build_markdown_strings_for_node(nodes_selected)
    155 168   
    156 169   st.markdown(":busts_in_silhouette: **People**")
    skipped 4 lines
    161 174   for c in markdown_strings["companies"]:
    162 175   st.markdown(c)
    163 176   
     177 + st.download_button(
     178 + "Download subnetwork",
     179 + nodes_selected.to_csv().encode("utf-8"),
     180 + file_name=f"{subgraph_hash}.csv",
     181 + )
     182 + 
    app/utils.py
    1 1  import streamlit as st
    2 2  from streamlit_agraph import Node, Edge
    3  -import json
     3 + 
    4 4  import pandas as pd
    5 5   
    6  -NODE_COLOUR_NON_DODGY = "#72EF77"
    7  -NODE_COLOUR_DODGY = "#F63333"
    8  -NODE_IMAGE_PERSON = "http://i.ibb.co/LrY3tfw/747376.png" # https://www.flaticon.com/free-icon/user_747376
    9  -NODE_IMAGE_COMPANY = "http://i.ibb.co/fx6r1dZ/4812244.png" # https://www.flaticon.com/free-icon/company_4812244
     6 +NODE_COLOUR_NON_DODGY = "#D1E2F8"
     7 +NODE_COLOUR_DODGY = "#F99292"
     8 +NODE_IMAGE_PERSON = "https://raw.githubusercontent.com/sahanmar/451/main/data/person_icon.png" # https://www.flaticon.com/free-icon/user_747376
     9 +NODE_IMAGE_COMPANY = "https://raw.githubusercontent.com/sahanmar/451/main/data/company_icon.png" # https://www.flaticon.com/free-icon/company_4812244
    10 10   
    11 11   
    12 12  @st.cache()
    13 13  def get_subgraph_df():
    14  - return pd.read_parquet("./data/network.parquet", engine="pyarrow").set_index(
    15  - "network_id"
    16  - )
     14 + subgraphs = pd.read_parquet(
     15 + "./data/1m_networks.parquet", engine="pyarrow"
     16 + ).set_index("network_id")
     17 + return subgraphs
    17 18   
    18 19   
    19 20  @st.cache()
    20 21  def get_subgraph_nodes_df(subgraph_hash):
    21  - return pd.read_parquet(
    22  - "./data/nodes.parquet",
     22 + nodes = pd.read_parquet(
     23 + "./data/1m_nodes.parquet",
    23 24   filters=[[("subgraph_hash", "=", subgraph_hash)]],
    24 25   engine="pyarrow",
    25 26   )
     27 + 
     28 + return nodes
    26 29   
    27 30   
    28 31  @st.cache()
    29 32  def get_subgraph_edges_df(subgraph_hash):
    30 33   return pd.read_parquet(
    31  - "./data/edges.parquet",
     34 + "./data/1m_edges.parquet",
    32 35   filters=[[("subgraph_hash", "=", subgraph_hash)]],
    33 36   engine="pyarrow",
    34 37   )
    35 38   
    36 39   
     40 +@st.cache()
    37 41  def get_subgraph_with_risk_score(
    38 42   subgraph_table,
    39  - weight_chains,
    40 43   weight_cyclic,
    41  - weight_psc_haven,
     44 + weight_company_ratio,
     45 + weight_proxy_directors,
     46 + weight_multi_jurisdiction,
     47 + weight_tax_havens,
    42 48   weight_pep,
    43  - weight_sanctions,
    44  - weight_disqualified,
     49 + weight_russian_pep,
    45 50  ):
    46 51   
    47 52   out = subgraph_table.copy()
    48  - out["total_risk"] = out["cyclicity"] * weight_cyclic / out["cyclicity"].max()
    49  - return out.sort_values(by="total_risk", ascending=False)
     53 + out["total_risk"] = (
     54 + (out["cyclicity"] * weight_cyclic / out["cyclicity"].max())
     55 + + (out["company_ratio"] * weight_company_ratio / out["company_ratio"].max())
     56 + + (
     57 + out["multi_jurisdiction"]
     58 + * weight_multi_jurisdiction
     59 + / out["multi_jurisdiction"].max()
     60 + )
     61 + + (out["tax_haven"] * weight_tax_havens / out["tax_haven"].max())
     62 + + (out["proxy"] * weight_proxy_directors / out["proxy"].max())
     63 + + (out["potential_pep_match"] * weight_pep / out["potential_pep_match"].max())
     64 + + (
     65 + out["potential_rus_pep_match"]
     66 + * weight_russian_pep
     67 + / out["potential_rus_pep_match"].max()
     68 + )
     69 + )
     70 + return out.sort_values(by="total_risk", ascending=False).query("total_risk > 0")
    50 71   
    51 72   
    52 73  def build_agraph_components(
    skipped 6 lines
    59 80   edge_objects = []
    60 81   
    61 82   for _, row in nodes.iterrows():
    62  - # node_metadata = json.loads(row["node_metadata"])
     83 + 
     84 + node_features = parse_node_features(row)
     85 + 
    63 86   node_objects.append(
    64 87   Node(
    65 88   id=row["node_id"],
    66  - label="\n".join(row["node_id"].split("|")[0].split(" ")),
     89 + label=node_features["name"],
    67 90   size=20,
    68  - # color=NODE_COLOUR_DODGY
    69  - # if (row["pep"] > 0 or row["sanction"] > 0)
    70  - # else NODE_COLOUR_NON_DODGY,
    71  - image=NODE_IMAGE_PERSON,
    72  - # if row["is_person"] == 1
    73  - # else NODE_IMAGE_COMPANY,
     91 + color=NODE_COLOUR_DODGY
     92 + if node_features["is_dodgy"]
     93 + else NODE_COLOUR_NON_DODGY,
     94 + image=NODE_IMAGE_PERSON
     95 + if node_features["is_person"]
     96 + else NODE_IMAGE_COMPANY,
    74 97   shape="circularImage",
    75 98   )
    76 99   )
    skipped 2 lines
    79 102   edge_objects.append(
    80 103   Edge(
    81 104   source=row["source"],
    82  - # label=row["type"][0],
     105 + label=row["type"][0],
    83 106   target=row["target"],
     107 + smooth=True,
    84 108   )
    85 109   )
    86 110   
    87 111   return (node_objects, edge_objects)
    88 112   
    89 113   
     114 +@st.cache()
    90 115  def build_markdown_strings_for_node(nodes_selected):
    91 116   """Separate into People and Company strings"""
    92 117   
    skipped 2 lines
    95 120   markdown_strings["people"] = []
    96 121   
    97 122   for _, row in nodes_selected.iterrows():
    98  - node_metadata = {
    99  - "name": row["node_id"],
    100  - "is_proxy": row["proxy_dir"],
    101  - "is_person": True,
    102  - }
    103  - 
    104  - # node_metadata = json.loads(row["node_metadata"])
    105  - # node_sanctions = (
    106  - # "" if row["sanction"] == 0 else f"! SANCTIONED: {row['sanction_metadata']}"
    107  - # )
    108  - # node_pep = "" if row["pep"] == 0 else f"! PEP: {row['pep_metadata']}"
    109 123   
    110  - node_sanctions = ""
    111  - node_pep = ""
     124 + node_features = parse_node_features(row)
    112 125   
    113  - if node_metadata["is_person"]:
    114  - # node_title = f"{node_metadata['name']} [{node_metadata['nationality']}/{node_metadata['yob']}/{node_metadata['mob']}]"
    115  - node_title = f"{node_metadata['name']}"
     126 + if node_features["is_person"]:
     127 + text = f"{node_features['name']} [{node_features['nationality']}/{node_features['yob']}/{node_features['mob']}]"
    116 128   key = "people"
    117 129   else:
    118  - # node_title = f"{node_metadata['name']} [{row['jur']}/{node_metadata['reg']}/{node_metadata['address']}]"
    119  - node_title = f"{node_metadata['name']}"
     130 + text = f"{node_features['name']} [{node_features['company_number']}/{node_features['country']}]"
    120 131   key = "companies"
    121 132   
    122  - markdown_strings[key].append(
    123  - "\n".join(
    124  - [x for x in ["```", node_title, node_pep, node_sanctions] if len(x) > 0]
    125  - )
    126  - )
     133 + if node_features["is_proxy"]:
     134 + text += "\n !! Proxy director !!"
     135 + 
     136 + if node_features["is_pep"]:
     137 + text += "\n !! Politically-exposed person !!"
     138 + text += f"\n -> {row['politician_metadata']}"
     139 + 
     140 + if node_features["is_rus_pep"]:
     141 + text += "\n !! Russian politically-exposed person !!"
     142 + text += f"\n -> {row['rus_politician_metadata']}"
     143 + 
     144 + markdown_strings[key].append(f"```\n{text}")
    127 145   
    128 146   return markdown_strings
    129 147   
     148 + 
     149 +@st.cache()
     150 +def parse_node_features(row):
     151 + out = dict()
     152 + 
     153 + out["is_person"] = row["is_person"] == 1
     154 + out["is_proxy"] = row["proxy_dir"] == 1
     155 + out["is_pep"] = row["politician"] == 1
     156 + out["is_rus_pep"] = row["rus_politician"] == 1
     157 + out["is_dodgy"] = out["is_proxy"] or out["is_pep"] or out["is_rus_pep"]
     158 + 
     159 + if row["node_metadata"]:
     160 + node_metadata = row["node_metadata"]
     161 + else:
     162 + node_metadata = {"forenames": None, "surname": None, "name": None}
     163 + 
     164 + if out["is_person"]:
     165 + if node_metadata["forenames"] and node_metadata["surname"]:
     166 + out["name"] = node_metadata["forenames"] + " " + node_metadata["surname"]
     167 + else:
     168 + out["name"] = row["node_id"]
     169 + else:
     170 + if node_metadata["name"]:
     171 + out["name"] = node_metadata["name"]
     172 + else:
     173 + out["name"] = node_metadata["surname"]
     174 + 
     175 + raw_metadata = row["node_metadata"] if row["node_metadata"] else {}
     176 + raw_metadata = {k: v for k, v in raw_metadata.items() if v is not None}
     177 + 
     178 + out["address"] = raw_metadata.get("address", "")
     179 + out["country"] = raw_metadata.get("country", "")
     180 + out["yob"] = raw_metadata.get("yob", "")
     181 + out["mob"] = raw_metadata.get("mob", "")
     182 + out["nationality"] = raw_metadata.get("nationality", "")
     183 + out["company_number"] = raw_metadata.get("company_number", "")
     184 + 
     185 + return out
     186 + 
  • data/company_icon.png
  • data/person_icon.png
    requirements.txt
    skipped 54 lines
    55 55  pyparsing==3.0.9
    56 56  pyrsistent==0.18.1
    57 57  python-dateutil==2.8.2
     58 +python-decouple==3.6
    58 59  pytz==2022.2.1
    59 60  pytz-deprecation-shim==0.1.0.post0
    60 61  pyzmq==24.0.1
    skipped 6 lines
    67 68  smmap==5.0.0
    68 69  stack-data==0.5.0
    69 70  streamlit==1.13.0
     71 +streamlit-aggrid==0.3.3
    70 72  streamlit-agraph==0.0.42
    71 73  toml==0.10.2
    72 74  toolz==0.12.0
    skipped 4 lines
    77 79  tzlocal==4.2
    78 80  urllib3==1.26.12
    79 81  validators==0.20.0
     82 +watchdog==2.1.9
    80 83  wcwidth==0.2.5
    81 84  zipp==3.8.1
    82 85   