    data_cache/DATA_SCHEMA_README.md
      1 +# 451 Corporate Risk Miner: Datasets Used
     2 + 
     3 +## Datasets/Data Sources Used:
     4 + 
      5 +UK Companies House (UKCH) Datasets:
     6 + 
     7 +- UKCH Company Dataset, downloaded from: http://download.companieshouse.gov.uk/en_output.html
     8 +- UKCH PSC Company Dataset, downloaded from: http://download.companieshouse.gov.uk/en_pscdata.html
     9 +- UKCH Officers Dataset, scraped from: https://developer-specs.company-information.service.gov.uk/companies-house-public-data-api/reference/officers/list
      10 +- EveryPolitician dataset, downloaded from: https://everypolitician.org/countries.html
      11 +- Russian PEPs (politically exposed persons), downloaded from: https://rupep.org/ru/persons_list/
     12 + 
     13 +## Input Schema:
      14 +Every entry in every dataset is assigned a unique `mention_id`, a concatenation of the dataset name and the row number.
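For illustration, a minimal sketch of how such an id could be constructed (the separator and helper below are assumptions for illustration, not taken from the pipeline):

```python
import pandas as pd

def add_mention_ids(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    # Assign one mention_id per row: "<dataset name>|<row number>" (format assumed).
    df = df.reset_index(drop=True)
    df["mention_id"] = [f"{dataset_name}|{i}" for i in df.index]
    return df

# e.g. company_df = add_mention_ids(company_df, "ukch_companies")
```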
     15 + 
      16 +### The UKCH Company Dataset is consumed as `company_df` with the following columns:
     17 +#### `company_df`
     18 +- company_number
     19 +- company_name
     20 +- country
     21 +- industry_code
     22 +- address
     23 + 
      24 +### The UKCH PSC Dataset is consumed by splitting person and company owners apart:
     25 + 
     26 +#### `psc_company_df`:
     27 +- name
     28 +- combined_address
     29 +- kind
     30 +- company_number
      31 +- psc_derived_company_number (this is the Companies House id of the owning entity)
     32 + 
      33 +#### `psc_person_df`:
     34 +- name
     35 +- name_elements_middle_name
     36 +- name_elements_forename
     37 +- name_elements_surname
     38 +- nationality
     39 +- address_postal_code
     40 +- date_of_birth_year
     41 +- date_of_birth_month
     42 +- company_number
     43 + 
      44 +### UKCH Company Officers:
     45 +#### `officer_df`:
     46 +- forenames
     47 +- surname
     48 +- nationality
     49 +- yob
     50 +- mob
     51 +- appointment_role
     52 +- post_town
     53 +- postal_code
     54 +- country
     55 + 
    data_cache/build_data_cache.ipynb
     1 +{
     2 + "cells": [
     3 + {
     4 + "cell_type": "markdown",
     5 + "id": "9b424ba0-9394-4411-a475-cffe1d9c7fce",
     6 + "metadata": {},
     7 + "source": [
     8 + "## Create Data Cache:"
     9 + ]
     10 + },
     11 + {
     12 + "cell_type": "code",
     13 + "execution_count": 5,
     14 + "id": "d77b1094-5c00-4c46-8065-0e9892e15028",
     15 + "metadata": {
     16 + "iooxa": {
     17 + "id": {
     18 + "block": "FrkUPeD1VXG9V8zSSfh8",
     19 + "project": "cNRcIFOMOBAHj5O57Joq",
     20 + "version": 1
     21 + },
     22 + "outputId": {
     23 + "block": "zlAgkbC1M5B0QnHjKdn0",
     24 + "project": "cNRcIFOMOBAHj5O57Joq",
     25 + "version": 1
     26 + }
     27 + }
     28 + },
     29 + "outputs": [
     30 + {
     31 + "name": "stderr",
     32 + "output_type": "stream",
     33 + "text": [
     34 + "UKCH Company er map: 100%|██████████| 5107631/5107631 [00:05<00:00, 953235.19it/s] \n",
     35 + "Officer (person) er map: 100%|██████████| 10035057/10035057 [00:23<00:00, 422416.04it/s]\n",
     36 + "Officer (company) er map: 100%|██████████| 313158/313158 [00:00<00:00, 756575.34it/s]\n",
     37 + "PSC (company) er map: 100%|██████████| 702472/702472 [00:00<00:00, 859221.68it/s]\n",
     38 + "PSC (person) er map: 100%|██████████| 9012596/9012596 [00:25<00:00, 355636.44it/s]"
     39 + ]
     40 + },
     41 + {
     42 + "name": "stdout",
     43 + "output_type": "stream",
     44 + "text": [
     45 + "CPU times: user 2min 33s, sys: 48.9 s, total: 3min 21s\n",
     46 + "Wall time: 4min 29s\n"
     47 + ]
     48 + },
     49 + {
     50 + "name": "stderr",
     51 + "output_type": "stream",
     52 + "text": [
     53 + "\n"
     54 + ]
     55 + }
     56 + ],
     57 + "source": [
     58 + "%%time\n",
     59 + "from data_cache.utils import ProduceEntityResolution\n",
     60 + "from data_cache.schema import schema\n",
     61 + "\n",
     62 + "from unidecode import unidecode\n",
     63 + "import pandas as pd\n",
     64 + "\n",
     65 + "# For more info on sources, see data_cache/DATA_SCHEMA_README:\n",
     66 + "company_df = pd.read_parquet(schema['ukch_companies'])\n",
     67 + "officer_df = pd.read_parquet(schema['ukch_officers'])\n",
     68 + "psc_company_df = pd.read_parquet(schema['psc_company'])\n",
     69 + "psc_person_df = pd.read_parquet(schema['psc_person'])\n",
     70 + "\n",
     71 + "all_politicians = pd.read_csv(schema['politicians_parsed'])\n",
     72 + "all_politicians = all_politicians.where(pd.notnull(all_politicians), None)\n",
     73 + "\n",
     74 + "ru_bl_peps = pd.read_csv(schema['ru_bl_peps_parsed'])\n",
     75 + "ru_bl_peps = ru_bl_peps.where(pd.notnull(ru_bl_peps), None)\n",
     76 + "\n",
     77 + "un_sanctions = pd.read_csv(schema['un_parsed'])\n",
     78 + "un_sanctions = un_sanctions.where(pd.notnull(un_sanctions), None)\n",
     79 + "\n",
     80 + "per = ProduceEntityResolution(company_df)\n",
     81 + "per.resolve_entities(company_df, officer_df, psc_company_df, psc_person_df)"
     82 + ]
     83 + },
     84 + {
     85 + "cell_type": "markdown",
     86 + "id": "eb76af3a-8a7b-4864-b034-f30ba8d69e4e",
     87 + "metadata": {
     88 + "iooxa": {
     89 + "id": {
     90 + "block": "8Ky77gWt42j1Xau0trNJ",
     91 + "project": "cNRcIFOMOBAHj5O57Joq",
     92 + "version": 1
     93 + }
     94 + }
     95 + },
     96 + "source": [
     97 + "## Graph Building and Breaking down into subnetworks:"
     98 + ]
     99 + },
     100 + {
     101 + "cell_type": "code",
     102 + "execution_count": 6,
     103 + "id": "af6a9b94-1c68-4fa1-959d-db497d796de3",
     104 + "metadata": {
     105 + "iooxa": {
     106 + "id": {
     107 + "block": "Z2iDuhiAMrSLfhPmW232",
     108 + "project": "cNRcIFOMOBAHj5O57Joq",
     109 + "version": 1
     110 + },
     111 + "outputId": {
     112 + "block": "as1z2tZl1DsTqxwV8anY",
     113 + "project": "cNRcIFOMOBAHj5O57Joq",
     114 + "version": 1
     115 + }
     116 + }
     117 + },
     118 + "outputs": [
     119 + {
     120 + "name": "stderr",
     121 + "output_type": "stream",
     122 + "text": [
     123 + "PSC company graph: 702472it [00:03, 180621.51it/s]\n",
     124 + "PSC person graph: 9012596it [01:01, 145691.31it/s]\n",
     125 + "Officer graph: 10348215it [00:42, 241512.75it/s]\n"
     126 + ]
     127 + },
     128 + {
     129 + "name": "stdout",
     130 + "output_type": "stream",
     131 + "text": [
     132 + "Top 10 Connected component sizes: [4373053, 1630, 1313, 760, 753, 512, 430, 409, 355, 346]\n"
     133 + ]
     134 + },
     135 + {
     136 + "name": "stderr",
     137 + "output_type": "stream",
     138 + "text": [
     139 + "Breaking down Giant CC (size 4373053): 100%|██████████| 4373053/4373053 [05:18<00:00, 13723.90it/s]\n"
     140 + ]
     141 + },
     142 + {
     143 + "name": "stdout",
     144 + "output_type": "stream",
     145 + "text": [
     146 + "Giant Component of size 4373053 was broken down.\n",
     147 + " Added 3139321 neighbourhoods, \n",
     148 + " Sum of all nodes = 217725726\n",
     149 + " Overhead ratio=49.78803732769761\n",
     150 + "\n"
     151 + ]
     152 + },
     153 + {
     154 + "name": "stderr",
     155 + "output_type": "stream",
     156 + "text": [
     157 + "Breaking down Giant CC (size 1630): 100%|██████████| 1630/1630 [00:00<00:00, 8143.89it/s]\n"
     158 + ]
     159 + },
     160 + {
     161 + "name": "stdout",
     162 + "output_type": "stream",
     163 + "text": [
     164 + "Giant Component of size 1630 was broken down.\n",
     165 + " Added 1443 neighbourhoods, \n",
     166 + " Sum of all nodes = 330985\n",
     167 + " Overhead ratio=203.05828220858896\n",
     168 + "\n"
     169 + ]
     170 + },
     171 + {
     172 + "name": "stderr",
     173 + "output_type": "stream",
     174 + "text": [
     175 + "Breaking down Giant CC (size 1313): 100%|██████████| 1313/1313 [00:00<00:00, 2181.59it/s]\n"
     176 + ]
     177 + },
     178 + {
     179 + "name": "stdout",
     180 + "output_type": "stream",
     181 + "text": [
     182 + "Giant Component of size 1313 was broken down.\n",
     183 + " Added 84 neighbourhoods, \n",
     184 + " Sum of all nodes = 16293\n",
     185 + " Overhead ratio=12.408987052551408\n",
     186 + "\n",
     187 + "CPU times: user 7min 51s, sys: 14.1 s, total: 8min 5s\n",
     188 + "Wall time: 8min 3s\n"
     189 + ]
     190 + }
     191 + ],
     192 + "source": [
     193 + "%%time\n",
     194 + "from data_cache.utils import GraphBuilder\n",
     195 + "gb = GraphBuilder()\n",
     196 + "gb.build(per, psc_company_df, psc_person_df, officer_df)\n",
     197 + "gb.break_into_subgraphs(1000)"
     198 + ]
     199 + },
     200 + {
     201 + "cell_type": "markdown",
     202 + "id": "17fb8fcd-feb8-4ee3-b561-804813d86dd0",
     203 + "metadata": {},
     204 + "source": [
     205 + "### Add node describtors for risk calculation later:"
     206 + ]
     207 + },
     208 + {
     209 + "cell_type": "code",
     210 + "execution_count": 7,
     211 + "id": "503789de-af85-466b-890d-c379e8da8c4e",
     212 + "metadata": {
     213 + "iooxa": {
     214 + "id": {
     215 + "block": "Dag1oRYbibJPCEAO30P2",
     216 + "project": "cNRcIFOMOBAHj5O57Joq",
     217 + "version": 1
     218 + },
     219 + "outputId": {
     220 + "block": "jzcbgB4EZdoFec0VdJhm",
     221 + "project": "cNRcIFOMOBAHj5O57Joq",
     222 + "version": 1
     223 + }
     224 + }
     225 + },
     226 + "outputs": [
     227 + {
     228 + "name": "stderr",
     229 + "output_type": "stream",
     230 + "text": [
     231 + "company: 100%|██████████| 5107631/5107631 [00:46<00:00, 110139.75it/s]\n",
     232 + "officer_person: 100%|██████████| 10035057/10035057 [00:29<00:00, 342760.19it/s]\n",
     233 + "officer_company: 100%|██████████| 313158/313158 [00:00<00:00, 626877.37it/s]\n",
     234 + "psc_person: 100%|██████████| 9012596/9012596 [00:22<00:00, 397809.45it/s]\n",
     235 + "psc_company: 100%|██████████| 702472/702472 [00:01<00:00, 491691.74it/s]\n"
     236 + ]
     237 + },
     238 + {
     239 + "name": "stdout",
     240 + "output_type": "stream",
     241 + "text": [
     242 + "CPU times: user 1min 51s, sys: 5.37 s, total: 1min 57s\n",
     243 + "Wall time: 1min 56s\n"
     244 + ]
     245 + }
     246 + ],
     247 + "source": [
     248 + "%%time\n",
     249 + "%load_ext autoreload\n",
     250 + "%autoreload 2\n",
     251 + "from data_cache.utils import NodeDescriber\n",
     252 + "\n",
     253 + "nd = NodeDescriber(per)\n",
     254 + "nd.add_metadata(company_df, officer_df, psc_company_df, psc_person_df)"
     255 + ]
     256 + },
     257 + {
     258 + "cell_type": "markdown",
     259 + "id": "76448310-b857-459d-b6e1-6185dc908d9a",
     260 + "metadata": {},
     261 + "source": [
     262 + "### Find if entities have possible matches against Politicians datasets"
     263 + ]
     264 + },
     265 + {
     266 + "cell_type": "code",
     267 + "execution_count": null,
     268 + "id": "7f4c8507-2805-4706-aaf8-07899b177da9",
     269 + "metadata": {},
     270 + "outputs": [],
     271 + "source": [
     272 + "PEP, RUS = {}, {}\n",
     273 + "for name, dob, country in zip(all_politicians.NAME, all_politicians.DOB, all_politicians.COUNTRY):\n",
     274 + " name = unidecode(name).lower()\n",
     275 + " PEP[name] = {\"country\": country, \"source\": \"every_politician\"}\n",
     276 + " if isinstance(dob, str):\n",
     277 + " PEP[name][\"yob\"] = int(dob[:4])\n",
     278 + " if len(dob) == 10:\n",
     279 + " PEP[name][\"mob\"] = int(dob[5:7])\n",
     280 + "for name, dob, cat, tx in zip(ru_bl_peps.NAME_EN, ru_bl_peps.DOB, ru_bl_peps.CATEGORY, ru_bl_peps.TAXPAYER_NUM):\n",
     281 + " name = unidecode(name).lower()\n",
     282 + " RUS[name] = {\"country\": \"RU/BY\", \"category\": cat, \"taxpayer_num\": tx, \"source\": \"rupep.org\"}\n",
     283 + " if isinstance(dob, str) and len(dob) == 10:\n",
     284 + " RUS[name][\"yob\"] = int(dob[-4:])\n",
     285 + " RUS[name][\"mob\"] = int(dob[3:5])"
     286 + ]
     287 + },
     288 + {
     289 + "cell_type": "markdown",
     290 + "id": "276470f1-084f-44c7-bc30-fa3a6905c170",
     291 + "metadata": {
     292 + "iooxa": {
     293 + "id": {
     294 + "block": "27gIHsImOPFu9FiC5HJe",
     295 + "project": "cNRcIFOMOBAHj5O57Joq",
     296 + "version": 1
     297 + }
     298 + }
     299 + },
     300 + "source": [
     301 + "## Build subnetwork stats:"
     302 + ]
     303 + },
     304 + {
     305 + "cell_type": "code",
     306 + "execution_count": 12,
     307 + "id": "b78891af-f60f-415f-8fc4-dbd0ab0487f6",
     308 + "metadata": {
     309 + "iooxa": {
     310 + "id": {
     311 + "block": "EETu5XmgZCjgCT2mtRmt",
     312 + "project": "cNRcIFOMOBAHj5O57Joq",
     313 + "version": 1
     314 + },
     315 + "outputId": {
     316 + "block": "zkzN2j3pJpxUBmPjXebN",
     317 + "project": "cNRcIFOMOBAHj5O57Joq",
     318 + "version": 1
     319 + }
     320 + },
     321 + "tags": []
     322 + },
     323 + "outputs": [
     324 + {
     325 + "name": "stderr",
     326 + "output_type": "stream",
     327 + "text": [
     328 + "Precomputing risk signals: 100%|██████████| 1000000/1000000 [50:37<00:00, 329.22it/s] \n"
     329 + ]
     330 + }
     331 + ],
     332 + "source": [
     333 + "from tqdm import tqdm \n",
     334 + "import numpy as np\n",
     335 + "from utils import TAX_HEAVENS\n",
     336 + "\n",
     337 + "# How many networks to cache. UKCH Total in 2022 is about 7M.\n",
     338 + "N = 1_000_000\n",
     339 + "PROXY_TH = 50\n",
     340 + "PARTITION_SIZE = 1000\n",
     341 + "\n",
     342 + "subnetwork_ids = list(gb.hash_to_subn_map.keys())[:N]\n",
     343 + "\n",
     344 + "def count_company_ratio(ns):\n",
     345 + " return np.mean([not n.startswith(\"p|\") for n in ns])\n",
     346 + "\n",
     347 + "def calculate_cyclicity(H):\n",
     348 + " if H.number_of_nodes() < 1:\n",
     349 + " print(f\"Non existent network: {netws}\")\n",
     350 + " return 0\n",
     351 + " E = H.number_of_edges()\n",
     352 + " N = H.number_of_nodes()\n",
     353 + " return (E + 1 - N)/(N*np.log(N))\n",
     354 + "\n",
     355 + "def netw_names(names):\n",
     356 + " names = set(names) - {None}\n",
     357 + " return \", \".join(sorted(names))\n",
     358 + "\n",
     359 + "def metadata_converter(md):\n",
     360 + " return \"; \".join([f\"{k}: {v}\" for k, v in md.items()])\n",
     361 + "\n",
     362 + "clc, node_num, dfs, company_ratio, entity_names, multi_jurisdiction, jur_names, netw_tax_haven = [], [], [], [], [], [], [], []\n",
     363 + "nodes, proxy, is_person, tax_haven, jur, node_metadata, netws = [], [], [], [], [], [], []\n",
     364 + "pep, pepm, r, rm, netw_pep, netw_r = [], [], [], [], [], []\n",
     365 + "for _id in tqdm(subnetwork_ids[:N], desc= \"Precomputing risk signals\"):\n",
     366 + " \n",
     367 + " # Get networkx subgraph:\n",
     368 + " nw = gb.hash_to_subn_map[_id]\n",
     369 + " H = gb.G_undir.subgraph(nw)\n",
     370 + " \n",
     371 + " # Network:\n",
     372 + " clc.append(calculate_cyclicity(H))\n",
     373 + " node_num.append(len(nw))\n",
     374 + " company_ratio.append(count_company_ratio(nw))\n",
     375 + " \n",
     376 + " # Edges:\n",
     377 + " df = pd.DataFrame(H.edges.data(\"edge_type\"), columns =['source', 'target', 'type'])\n",
     378 + " df['subgraph_hash'] =_id\n",
     379 + " df['subgraph_partition'] =_id % PARTITION_SIZE\n",
     380 + " dfs.append(df)\n",
     381 + " \n",
     382 + " # Nodes:\n",
     383 + " ns = gb.hash_to_subn_map[_id]\n",
     384 + " netw_jurs, netw_entity_names = [], []\n",
     385 + " netw_pep_value, netw_r_value = 0, 0\n",
     386 + " for n in ns:\n",
     387 + " nodes.append(n)\n",
     388 + " netws.append(_id)\n",
     389 + " proxy.append(int(gb.G_undir.degree[n] > PROXY_TH))\n",
     390 + " is_person.append(int(n.startswith(\"p|\")))\n",
     391 + " node_metadata.append(nd.node_to_metadata.get(n, None))\n",
     392 + " \n",
     393 + " j = nd.node_to_jurs.get(n, set())\n",
     394 + " jur.append(\", \".join(sorted(j)))\n",
     395 + " tax_haven.append(int(len(j.intersection(TAX_HEAVENS)) > 0))\n",
     396 + " \n",
     397 + " netw_jurs.append(nd.node_to_jurs.get(n, None))\n",
     398 + " \n",
     399 + " name = nd.node_to_names.get(n, None)\n",
     400 + " netw_entity_names.append(name)\n",
     401 + " \n",
     402 + " if name is not None and name in RUS:\n",
     403 + " r.append(1)\n",
     404 + " rm.append(metadata_converter(RUS[name]))\n",
     405 + " netw_r_value += 1\n",
     406 + " else:\n",
     407 + " r.append(0)\n",
     408 + " rm.append(\"\")\n",
     409 + " \n",
     410 + " if name is not None and name in PEP:\n",
     411 + " pep.append(1)\n",
     412 + " pepm.append(metadata_converter(PEP[name]))\n",
     413 + " netw_pep_value += 1\n",
     414 + " else:\n",
     415 + " pep.append(0)\n",
     416 + " pepm.append(\"\")\n",
     417 + "\n",
     418 + " netw_pep.append(netw_pep_value)\n",
     419 + " netw_r.append(netw_r_value)\n",
     420 + " jurs_in_subnetwork = list(set().union(*[n for n in netw_jurs if n is not None]))\n",
     421 + " jur_names.append(\"; \".join(jurs_in_subnetwork))\n",
     422 + " netw_tax_haven.append(int(len(set(jurs_in_subnetwork).intersection(TAX_HEAVENS))> 0))\n",
     423 + " multi_jurisdiction.append(int(len(jurs_in_subnetwork) > 1))\n",
     424 + " entity_names.append(netw_names(netw_entity_names))\n"
     425 + ]
     426 + },
     427 + {
     428 + "cell_type": "code",
     429 + "execution_count": 17,
     430 + "id": "efa05093-8e03-44d0-922f-e8e37302bc40",
     431 + "metadata": {},
     432 + "outputs": [
     433 + {
     434 + "name": "stdout",
     435 + "output_type": "stream",
     436 + "text": [
     437 + "CPU times: user 4min, sys: 12.2 s, total: 4min 12s\n",
     438 + "Wall time: 4min 11s\n"
     439 + ]
     440 + }
     441 + ],
     442 + "source": [
     443 + "%%time\n",
     444 + "PROXY_NETW_ID = set(nodes_df[nodes_df.proxy_dir==1].subgraph_hash)\n",
     445 + "subnetwork_df[\"proxy\"] = [int(s in PROXY_NETW_ID) for s in subnetwork_df.network_id.tolist()]\n",
     446 + "subnetwork_df = pd.DataFrame(data = {\n",
     447 + " \"network_id\": subnetwork_ids, \n",
     448 + " \"cyclicity\": clc, \n",
     449 + " \"node_num\": node_num, \n",
     450 + " \"company_ratio\": company_ratio,\n",
     451 + " \"multi_jurisdiction\": multi_jurisdiction, \n",
     452 + " \"tax_haven\": netw_tax_haven, \n",
     453 + " \"potential_pep_match\": netw_pep, \n",
     454 + " \"potential_rus_pep_match\": netw_r, \n",
     455 + " \"entity_names\": entity_names, \n",
     456 + " \"jur_names\": jur_names, \n",
     457 + "})\n",
     458 + "nodes_df = pd.DataFrame(data = {\n",
     459 + " 'node_id': nodes, \n",
     460 + " 'subgraph_hash': netws, \n",
     461 + " \"is_person\": is_person,\n",
     462 + " \"proxy_dir\": proxy, \n",
     463 + " \"node_metadata\": node_metadata,\n",
     464 + " \"tax_haven\": tax_haven, \n",
     465 + " \"jur\": jur,\n",
     466 + " \"politician\": pep, \n",
     467 + " \"politician_metadata\": pepm, \n",
     468 + " \"rus_politician\": r,\n",
     469 + " \"rus_politician_metadata\": rm, \n",
     470 + "})\n",
     471 + "edges_df = pd.concat(dfs)"
     472 + ]
     473 + },
     474 + {
     475 + "cell_type": "code",
     476 + "execution_count": 38,
     477 + "id": "3d6af603-5fa9-4eb9-8ce0-52a75ff27a44",
     478 + "metadata": {},
     479 + "outputs": [
     480 + {
     481 + "data": {
     482 + "text/plain": [
     483 + "((81482446, 5), (62008292, 12), (1000000, 11))"
     484 + ]
     485 + },
     486 + "execution_count": 38,
     487 + "metadata": {},
     488 + "output_type": "execute_result"
     489 + }
     490 + ],
     491 + "source": [
     492 + "edges_df.shape, nodes_df.shape, subnetwork_df.shape"
     493 + ]
     494 + },
     495 + {
     496 + "cell_type": "code",
     497 + "execution_count": 35,
     498 + "id": "6164657c-f196-43b2-9c14-bc8bdce811fd",
     499 + "metadata": {},
     500 + "outputs": [],
     501 + "source": [
     502 + "subnetwork_df.to_parquet(schema['output_nodes'])\n",
     503 + "edges_df.to_parquet(schema['output_nodes'], partition_cols = [\"subgraph_partition\"])\n",
     504 + "nodes_df['subgraph_partition'] = nodes_df.subgraph_hash.apply(lambda x: x%1000)\n",
     505 + "nodes_df.to_parquet(schema['output_nodes'], partition_cols = [\"subgraph_partition\"])"
     506 + ]
     507 + },
     508 + {
     509 + "cell_type": "code",
     510 + "execution_count": null,
     511 + "id": "5a26567c-2a19-484b-98bb-1bb4ccc154a9",
     512 + "metadata": {},
     513 + "outputs": [],
     514 + "source": []
     515 + }
     516 + ],
     517 + "metadata": {
     518 + "iooxa": {
     519 + "id": {
     520 + "block": "VE4MftXdmZ856rMTec6u",
     521 + "project": "cNRcIFOMOBAHj5O57Joq",
     522 + "version": 1
     523 + }
     524 + },
     525 + "kernelspec": {
     526 + "display_name": "Python 3",
     527 + "language": "python",
     528 + "name": "python3"
     529 + },
     530 + "language_info": {
     531 + "codemirror_mode": {
     532 + "name": "ipython",
     533 + "version": 3
     534 + },
     535 + "file_extension": ".py",
     536 + "mimetype": "text/x-python",
     537 + "name": "python",
     538 + "nbconvert_exporter": "python",
     539 + "pygments_lexer": "ipython3",
     540 + "version": "3.8.7"
     541 + }
     542 + },
     543 + "nbformat": 4,
     544 + "nbformat_minor": 5
     545 +}
     546 + 
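For reference, the `calculate_cyclicity` helper in the risk-signal cell above computes

$$\mathrm{cyclicity}(H) = \frac{E + 1 - N}{N \ln N},$$

where $E$ and $N$ are the edge and node counts of subnetwork $H$. For a connected graph, $E - (N - 1)$ is the circuit rank (the number of independent cycles), so tree-shaped ownership structures score 0 and densely cross-linked ones score higher, with the $N \ln N$ factor damping the effect of sheer size.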
    data_cache/data_schema.py
      1 +schema = {
      2 + "politicians_parsed": "/sanctions_and_peps/parsed/politicians_parsed.csv",
      3 + "ru_bl_peps_parsed": "/sanctions_and_peps/parsed/ru_bl_peps_parsed.csv",
      4 + # Assumed path for the parsed UN sanctions list the notebook reads (follows the pattern above):
      5 + "un_parsed": "/sanctions_and_peps/parsed/un_parsed.csv",
      6 + # Source: UK Companies House (UKCH), see the schema README for more details:
      7 + "ukch_companies": "/insert_download_folder/ukch_companies.parquet",
      8 + "ukch_officers": "/insert_download_folder/ukch_officers.parquet",
      9 + "psc_company": "/insert_download_folder/psc_company.parquet",
      10 + "psc_person": "/insert_download_folder/psc_person.parquet",
      11 + # Output locations for Streamlit to consume:
      12 + "output_nodes": "/data/nodes.parquet",
      13 + "output_edges": "/data/edges.parquet",
      14 + "output_networks": "/data/networks.parquet"
      15 +}
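A read-side sketch (not part of the source files) of why the node and edge outputs are partitioned by `subgraph_partition` (`subgraph_hash % 1000`): a consumer such as the Streamlit app can then load one subnetwork without scanning the full multi-million-row tables.

```python
import pandas as pd
from data_cache.data_schema import schema

subgraph_hash = 123456789          # hypothetical subnetwork id
partition = subgraph_hash % 1000   # matches the PARTITION_SIZE used at write time

# Partition pruning means only one of the ~1000 partitions is read:
nodes = pd.read_parquet(
    schema["output_nodes"],
    filters=[
        ("subgraph_partition", "=", partition),
        ("subgraph_hash", "=", subgraph_hash),
    ],
)
```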
    data_cache/utils.py
     1 +import random
     2 +from tqdm import tqdm
     3 +import networkx as nx
     4 +from collections import defaultdict
     5 + 
     6 + 
      7 +TAX_HAVENS = {
     8 + 'jersey',
     9 + 'luxembourg',
     10 + 'virgin islands, british',
     11 + 'cayman islands',
     12 + 'switzerland',
     13 + 'hong kong',
     14 + 'singapore',
     15 + 'guernsey',
     16 + 'bermuda',
     17 + 'seychelles',
     18 + 'united arab emirates',
     19 + 'marshall islands',
     20 + 'belize',
     21 + "isle of man",
     22 + "barbados",
     23 + "panama",
     24 + "bahamas",
     25 + "saint lucia",
     26 + "dominica",
     27 +}
     28 + 
     29 + 
     30 +class ProduceEntityResolution:
     31 + def __init__(self, company_df, max_rand=10 ** 8, min_len=7):
     32 + self.company_name_to_reg = {
     33 + n.upper().strip(): r for n, r in zip(company_df.company_name, company_df.company_number)
     34 + }
     35 + self.max_rand = max_rand
     36 + self.er_map = {}
     37 + self.min_len = min_len
     38 + 
     39 + def missing_entry_randomisation(self):
     40 + return random.randint(0, self.max_rand)
     41 + 
     42 + def get_person_identifier(self, forenames, surname, yob, mob):
     43 + if not surname:
     44 + return f"p|rand_{self.missing_entry_randomisation()}"
     45 + if forenames is None or len(forenames) == 0:
     46 + return f"p|rand_{self.missing_entry_randomisation()}"
     47 + elif isinstance(forenames, list):
     48 + name = " ".join(forenames).lower()
     49 + else:
     50 + name = forenames.lower()
     51 + return f"p|{name} {surname.lower()}|{yob}|{mob}"
     52 + 
     53 + def get_company_identifier(self, cname, cnum):
     54 + if cnum and len(cnum) >= self.min_len:
     55 + return f"c|{cnum}"
     56 + if cname in self.company_name_to_reg:
     57 + return f"c|{self.company_name_to_reg[cname]}"
     58 + if len(cname) > self.min_len:
     59 + return f"c|company_named_{cname}"
     60 + return f"c|rand_{self.missing_entry_randomisation()}"
     61 + 
     62 + def resolve_entities(self, company_df, officer_df, psc_company_df, psc_person_df):
     63 + 
     64 + # Company ER
     65 + inps = zip(company_df.mention_id, company_df.company_name, company_df.company_number)
     66 + for mid, cname, cnum in tqdm(inps, desc="UKCH Company er map", total=company_df.shape[0]):
     67 + self.er_map[mid] = self.get_company_identifier(cname, cnum)
     68 + 
     69 + off_person_df = officer_df[~officer_df.is_corporate_body]
     70 + inps = zip(
     71 + off_person_df.mention_id,
     72 + off_person_df.forenames,
     73 + off_person_df.surname,
     74 + off_person_df.yob,
     75 + off_person_df.mob,
     76 + )
     77 + for mid, fs, cs, yob, mob in tqdm(inps, desc="Officer (person) er map", total=off_person_df.shape[0]):
     78 + self.er_map[mid] = self.get_person_identifier(fs, cs, yob, mob)
     79 + 
     80 + off_company_df = officer_df[officer_df.is_corporate_body]
     81 + inps = zip(off_company_df.mention_id, off_company_df.surname)
     82 + for mid, cname in tqdm(inps, desc="Officer (company) er map", total=off_company_df.shape[0]):
     83 + self.er_map[mid] = self.get_company_identifier(cname, None)
     84 + 
     85 + # PSC Company ER
     86 + inps = zip(psc_company_df.mention_id, psc_company_df.name, psc_company_df.psc_derived_company_number)
     87 + for mid, cname, cnum in tqdm(inps, desc="PSC (company) er map", total=psc_company_df.shape[0]):
     88 + self.er_map[mid] = self.get_company_identifier(cname, cnum)
     89 + 
     90 + # People of significant control [physical people] ER:
     91 + inps = zip(
     92 + psc_person_df.mention_id,
     93 + psc_person_df.name_elements_forename,
     94 + psc_person_df.name_elements_middle_name,
     95 + psc_person_df.name_elements_surname,
     96 + psc_person_df.date_of_birth_year,
     97 + psc_person_df.date_of_birth_month,
     98 + )
     99 + for mid, cf, cm, cs, yob, mob in tqdm(inps, desc="PSC (person) er map", total=psc_person_df.shape[0]):
     100 + forenames = [x for x in [cf, cm] if x is not None]
     101 + self.er_map[mid] = self.get_person_identifier(forenames, cs, yob, mob)
     102 + 
     103 + 
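Illustrative calls for the identifier conventions above (values are hypothetical; the results follow the methods as written): people resolve to `p|<forenames> <surname>|<yob>|<mob>`, companies to `c|<registration number>`, and records with missing keys fall back to a random id.

```python
per = ProduceEntityResolution(company_df)  # company_df as loaded in the notebook

per.get_person_identifier(["jane"], "Doe", 1980, 4)  # -> "p|jane doe|1980|4"
per.get_company_identifier("ACME LTD", "01234567")   # -> "c|01234567"
per.get_person_identifier(None, None, None, None)    # -> "p|rand_<random int>"
```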
     104 +class GraphBuilder:
     105 + def __init__(self):
     106 + self.G = nx.DiGraph()
     107 + self.G_undir = nx.Graph()
     108 + self.CCs = []
     109 + self.ccs_len = []
     110 + self.hash_to_subn_map = {}
     111 + 
     112 + def build_G(self, per, psc_company_df, psc_person_df, officer_df):
     113 + for target_eid, source_mid in tqdm(zip(psc_company_df.company_number, psc_company_df.mention_id),
     114 + desc="PSC company graph"):
     115 + self.G_undir.add_edge(per.er_map[source_mid], f"c|{target_eid}", edge_type="own")
     116 + 
     117 + for target_eid, source_mid in tqdm(zip(psc_person_df.company_number, psc_person_df.mention_id),
     118 + desc="PSC person graph"):
     119 + self.G_undir.add_edge(per.er_map[source_mid], f"c|{target_eid}", edge_type="own")
     120 + 
     121 + for target_eid, source_mid in tqdm(zip(officer_df.company_number, officer_df.mention_id), desc="Officer graph"):
     122 + self.G_undir.add_edge(per.er_map[source_mid], f"c|{target_eid}", edge_type="control")
     123 + 
     124 + def get_neigh(self, nodes, radius=2):
     125 + nodes = set([n for n in nodes if n in self.G_undir])
     126 + for node in list(nodes):
     127 + neighbors = set(self.G_undir.neighbors(node))
     128 + neighbors = set([x for x in neighbors if isinstance(x, str)])
     129 + nodes = nodes.union(neighbors)
     130 + return nodes if radius == 1 else self.get_neigh(nodes, radius - 1)
     131 + 
     132 + def build(self, per, psc_company_df, psc_person_df, officer_df):
     133 + self.build_G(per, psc_company_df, psc_person_df, officer_df)
     134 + self.CCs = sorted(nx.connected_components(self.G_undir), key=len, reverse=True)
     135 + self.ccs_len = [len(c) for c in self.CCs]
     136 + print(f"Top 10 Connected component sizes: {self.ccs_len[:10]}")
     137 + 
     138 + @staticmethod
      139 + def hash_subnetwork(nodes):
     140 + return sum(hash(n) % 2 ** 32 for n in nodes)
     141 + 
     142 + def break_into_subgraphs(self, max_size=1000):
     143 + for cc in self.CCs:
     144 + # Small connected components go in as they are:
     145 + if len(cc) < max_size:
     146 + subnetwork_hash = self.hash_subnetwork(cc)
     147 + self.hash_to_subn_map[subnetwork_hash] = cc
     148 + # Giant component gets added by unique neighbourhoods
     149 + else:
     150 + size_before = len(self.hash_to_subn_map)
     151 + extra_nodes = 0
     152 + for node in tqdm(cc, desc=f"Breaking down Giant CC (size {len(cc)})"):
     153 + subnetwork = self.get_neigh([node])
     154 + subnetwork_hash = self.hash_subnetwork(subnetwork)
     155 + if subnetwork_hash not in self.hash_to_subn_map:
     156 + self.hash_to_subn_map[subnetwork_hash] = subnetwork
     157 + extra_nodes += len(subnetwork)
     158 + print(f"""Giant Component of size {len(cc)} was broken down.
     159 + Added {len(self.hash_to_subn_map) - size_before} neighbourhoods,
     160 + Sum of all nodes = {extra_nodes}
     161 + Overhead ratio={extra_nodes / len(cc)}
     162 +""")
     163 + 
     164 + 
     165 +class NodeDescriber:
     166 + def __init__(self, per):
     167 + self.per = per
     168 + self.node_to_jurs = defaultdict(set)
     169 + self.node_to_names = {}
     170 + self.node_to_metadata = {}
     171 + self.metadata_cols = {
     172 + "company": ["mention_id", "name", "industry_code", "company_number", "country", "address"],
     173 + "psc_person": ["mention_id", "name", "nationality", "address_postal_code", "date_of_birth_year",
     174 + "date_of_birth_month"],
     175 + "psc_company": ["mention_id", "name", "psc_derived_company_number", "combined_address", "address_country",
     176 + "kind"],
     177 + "officer_person": ["mention_id", "forenames", "surname", "nationality", "yob", "mob"],
     178 + "officer_company": ["mention_id", "surname", "country", 'appointment_role', 'post_town', 'postal_code'],
     179 + }
     180 + 
     181 + def unify_jur(self, c):
     182 + if c is None:
     183 + return None
     184 + if c in ['england', 'wales', 'northern ireland', 'scotland', "uk"]:
     185 + return 'united kingdom'
     186 + if c in ["usa", "united states", "united states of america"]:
     187 + return "united states"
     188 + if c in ["british virgin islands", "virgin islands, british", "virgin islands"]:
     189 + return "virgin islands, british"
     190 + return c
     191 + 
     192 + def add_node_to_names(self):
     193 + for eid in set(self.per.er_map.values()):
     194 + if eid.startswith("p|"):
     195 + self.node_to_names[eid] = eid.split("|")[1]
     196 + 
     197 + def add_dataset_metadata(self, df, dataset_name):
     198 + c = self.metadata_cols[dataset_name]
     199 + inp = zip(df[c[0]], df[c[1]], df[c[2]], df[c[3]], df[c[4]], df[c[5]])
     200 + for tpl in tqdm(inp, desc=dataset_name, total=df.shape[0]):
     201 + eid = self.per.er_map[tpl[0]]
     202 + if eid not in self.node_to_metadata:
     203 + self.node_to_metadata[eid] = {}
     204 + for col_inx in range(1, len(c)):
     205 + col = self.metadata_cols[dataset_name][col_inx]
     206 + self.node_to_metadata[eid][col] = tpl[col_inx]
     207 + if col in ["country", "address_country", "reg_address_country"]:
     208 + if isinstance(tpl[col_inx], str):
     209 + unified_jur = self.unify_jur(tpl[col_inx].lower())
     210 + self.node_to_jurs[eid].add(unified_jur)
     211 + 
     212 + def add_metadata(self, company_df, officer_df, psc_company_df, psc_person_df):
     213 + self.add_dataset_metadata(company_df, "company")
     214 + self.add_dataset_metadata(officer_df[~officer_df.is_corporate_body], "officer_person")
     215 + self.add_dataset_metadata(officer_df[officer_df.is_corporate_body], "officer_company")
     216 + self.add_dataset_metadata(psc_person_df, "psc_person")
     217 + self.add_dataset_metadata(psc_company_df, "psc_company")
     218 + self.add_node_to_names()
     219 + 
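A property worth noting about `GraphBuilder.hash_subnetwork`: summing per-node hashes makes the subnetwork hash independent of iteration order, which is what lets `break_into_subgraphs` deduplicate overlapping neighbourhoods. One caveat (our observation, not from the source): Python randomizes string hashes per process, so these ids are only stable across runs if `PYTHONHASHSEED` is pinned.

```python
nodes = {"c|01234567", "p|jane doe|1980|4"}  # hypothetical node ids
h1 = GraphBuilder.hash_subnetwork(nodes)
h2 = GraphBuilder.hash_subnetwork(sorted(nodes))  # different iteration order
assert h1 == h2  # same node set -> same hash within one process
```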