    data_cache/DATA_SCHEMA_README.md
      1 +# 451 Corporate Risk Miner: Datasets Used
     2 + 
     3 +## Datasets/Data Sources Used:
     4 + 
      5 +UK Companies House (UKCH) Datasets:
     6 + 
     7 +- UKCH Company Dataset, downloaded from: http://download.companieshouse.gov.uk/en_output.html
     8 +- UKCH PSC Company Dataset, downloaded from: http://download.companieshouse.gov.uk/en_pscdata.html
     9 +- UKCH Officers Dataset, scraped from: https://developer-specs.company-information.service.gov.uk/companies-house-public-data-api/reference/officers/list
      10 +- EveryPolitician dataset, downloaded from: https://everypolitician.org/countries.html
      11 +- Russian PEPs (politically exposed persons), downloaded from: https://rupep.org/ru/persons_list/
     12 + 
     13 +## Input Schema:
      14 +Every entry in every dataset is assigned a unique `mention_id`, a concatenation of the dataset name and the row number.
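For illustration, a minimal sketch of how such an id could be constructed (the separator and helper below are assumptions for illustration, not taken from the pipeline):

```python
import pandas as pd

def add_mention_ids(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    # Assign one mention_id per row: "<dataset name>|<row number>" (format assumed).
    df = df.reset_index(drop=True)
    df["mention_id"] = [f"{dataset_name}|{i}" for i in df.index]
    return df

# e.g. company_df = add_mention_ids(company_df, "ukch_companies")
```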
     15 + 
      16 +### The UKCH Company Dataset is consumed as `company_df` with the following columns:
     17 +#### `company_df`
     18 +- company_number
     19 +- company_name
     20 +- country
     21 +- industry_code
     22 +- address
     23 + 
      24 +### The UKCH PSC Dataset is consumed by splitting person and company owners apart:
     25 + 
     26 +#### `psc_company_df`:
     27 +- name
     28 +- combined_address
     29 +- kind
     30 +- company_number
      31 +- psc_derived_company_number (this is the Companies House id of the owning entity)
     32 + 
      33 +#### `psc_person_df`:
     34 +- name
     35 +- name_elements_middle_name
     36 +- name_elements_forename
     37 +- name_elements_surname
     38 +- nationality
     39 +- address_postal_code
     40 +- date_of_birth_year
     41 +- date_of_birth_month
     42 +- company_number
     43 + 
      44 +### UKCH Company Officers:
     45 +#### `officer_df`:
     46 +- forenames
     47 +- surname
     48 +- nationality
     49 +- yob
     50 +- mob
     51 +- appointment_role
     52 +- post_town
     53 +- postal_code
     54 +- country
     55 + 
    data_cache/build_data_cache.ipynb
     1 +{
     2 + "cells": [
     3 + {
     4 + "cell_type": "markdown",
     5 + "id": "9b424ba0-9394-4411-a475-cffe1d9c7fce",
     6 + "metadata": {},
     7 + "source": [
     8 + "## Create Data Cache:"
     9 + ]
     10 + },
     11 + {
     12 + "cell_type": "code",
     13 + "execution_count": 5,
     14 + "id": "d77b1094-5c00-4c46-8065-0e9892e15028",
     15 + "metadata": {
     16 + "iooxa": {
     17 + "id": {
     18 + "block": "FrkUPeD1VXG9V8zSSfh8",
     19 + "project": "cNRcIFOMOBAHj5O57Joq",
     20 + "version": 1
     21 + },
     22 + "outputId": {
     23 + "block": "zlAgkbC1M5B0QnHjKdn0",
     24 + "project": "cNRcIFOMOBAHj5O57Joq",
     25 + "version": 1
     26 + }
     27 + }
     28 + },
     29 + "outputs": [
     30 + {
     31 + "name": "stderr",
     32 + "output_type": "stream",
     33 + "text": [
     34 + "UKCH Company er map: 100%|██████████| 5107631/5107631 [00:05<00:00, 953235.19it/s] \n",
     35 + "Officer (person) er map: 100%|██████████| 10035057/10035057 [00:23<00:00, 422416.04it/s]\n",
     36 + "Officer (company) er map: 100%|██████████| 313158/313158 [00:00<00:00, 756575.34it/s]\n",
     37 + "PSC (company) er map: 100%|██████████| 702472/702472 [00:00<00:00, 859221.68it/s]\n",
     38 + "PSC (person) er map: 100%|██████████| 9012596/9012596 [00:25<00:00, 355636.44it/s]"
     39 + ]
     40 + },
     41 + {
     42 + "name": "stdout",
     43 + "output_type": "stream",
     44 + "text": [
     45 + "CPU times: user 2min 33s, sys: 48.9 s, total: 3min 21s\n",
     46 + "Wall time: 4min 29s\n"
     47 + ]
     48 + },
     49 + {
     50 + "name": "stderr",
     51 + "output_type": "stream",
     52 + "text": [
     53 + "\n"
     54 + ]
     55 + }
     56 + ],
     57 + "source": [
     58 + "%%time\n",
     59 + "from data_cache.utils import ProduceEntityResolution\n",
     60 + "from data_cache.schema import schema\n",
     61 + "\n",
     62 + "from unidecode import unidecode\n",
     63 + "import pandas as pd\n",
     64 + "\n",
     65 + "# For more info on sources, see data_cache/DATA_SCHEMA_README:\n",
     66 + "company_df = pd.read_parquet(schema['ukch_companies'])\n",
     67 + "officer_df = pd.read_parquet(schema['ukch_officers'])\n",
     68 + "psc_company_df = pd.read_parquet(schema['psc_company'])\n",
     69 + "psc_person_df = pd.read_parquet(schema['psc_person'])\n",
     70 + "\n",
     71 + "all_politicians = pd.read_csv(schema['politicians_parsed'])\n",
     72 + "all_politicians = all_politicians.where(pd.notnull(all_politicians), None)\n",
     73 + "\n",
     74 + "ru_bl_peps = pd.read_csv(schema['ru_bl_peps_parsed'])\n",
     75 + "ru_bl_peps = ru_bl_peps.where(pd.notnull(ru_bl_peps), None)\n",
     76 + "\n",
     77 + "un_sanctions = pd.read_csv(schema['un_parsed'])\n",
     78 + "un_sanctions = un_sanctions.where(pd.notnull(un_sanctions), None)\n",
     79 + "\n",
     80 + "per = ProduceEntityResolution(company_df)\n",
     81 + "per.resolve_entities(company_df, officer_df, psc_company_df, psc_person_df)"
     82 + ]
     83 + },
     84 + {
     85 + "cell_type": "markdown",
     86 + "id": "eb76af3a-8a7b-4864-b034-f30ba8d69e4e",
     87 + "metadata": {
     88 + "iooxa": {
     89 + "id": {
     90 + "block": "8Ky77gWt42j1Xau0trNJ",
     91 + "project": "cNRcIFOMOBAHj5O57Joq",
     92 + "version": 1
     93 + }
     94 + }
     95 + },
     96 + "source": [
     97 + "## Graph Building and Breaking down into subnetworks:"
     98 + ]
     99 + },
     100 + {
     101 + "cell_type": "code",
     102 + "execution_count": 6,
     103 + "id": "af6a9b94-1c68-4fa1-959d-db497d796de3",
     104 + "metadata": {
     105 + "iooxa": {
     106 + "id": {
     107 + "block": "Z2iDuhiAMrSLfhPmW232",
     108 + "project": "cNRcIFOMOBAHj5O57Joq",
     109 + "version": 1
     110 + },
     111 + "outputId": {
     112 + "block": "as1z2tZl1DsTqxwV8anY",
     113 + "project": "cNRcIFOMOBAHj5O57Joq",
     114 + "version": 1
     115 + }
     116 + }
     117 + },
     118 + "outputs": [
     119 + {
     120 + "name": "stderr",
     121 + "output_type": "stream",
     122 + "text": [
     123 + "PSC company graph: 702472it [00:03, 180621.51it/s]\n",
     124 + "PSC person graph: 9012596it [01:01, 145691.31it/s]\n",
     125 + "Officer graph: 10348215it [00:42, 241512.75it/s]\n"
     126 + ]
     127 + },
     128 + {
     129 + "name": "stdout",
     130 + "output_type": "stream",
     131 + "text": [
     132 + "Top 10 Connected component sizes: [4373053, 1630, 1313, 760, 753, 512, 430, 409, 355, 346]\n"
     133 + ]
     134 + },
     135 + {
     136 + "name": "stderr",
     137 + "output_type": "stream",
     138 + "text": [
     139 + "Breaking down Giant CC (size 4373053): 100%|██████████| 4373053/4373053 [05:18<00:00, 13723.90it/s]\n"
     140 + ]
     141 + },
     142 + {
     143 + "name": "stdout",
     144 + "output_type": "stream",
     145 + "text": [
     146 + "Giant Component of size 4373053 was broken down.\n",
     147 + " Added 3139321 neighbourhoods, \n",
     148 + " Sum of all nodes = 217725726\n",
     149 + " Overhead ratio=49.78803732769761\n",
     150 + "\n"
     151 + ]
     152 + },
     153 + {
     154 + "name": "stderr",
     155 + "output_type": "stream",
     156 + "text": [
     157 + "Breaking down Giant CC (size 1630): 100%|██████████| 1630/1630 [00:00<00:00, 8143.89it/s]\n"
     158 + ]
     159 + },
     160 + {
     161 + "name": "stdout",
     162 + "output_type": "stream",
     163 + "text": [
     164 + "Giant Component of size 1630 was broken down.\n",
     165 + " Added 1443 neighbourhoods, \n",
     166 + " Sum of all nodes = 330985\n",
     167 + " Overhead ratio=203.05828220858896\n",
     168 + "\n"
     169 + ]
     170 + },
     171 + {
     172 + "name": "stderr",
     173 + "output_type": "stream",
     174 + "text": [
     175 + "Breaking down Giant CC (size 1313): 100%|██████████| 1313/1313 [00:00<00:00, 2181.59it/s]\n"
     176 + ]
     177 + },
     178 + {
     179 + "name": "stdout",
     180 + "output_type": "stream",
     181 + "text": [
     182 + "Giant Component of size 1313 was broken down.\n",
     183 + " Added 84 neighbourhoods, \n",
     184 + " Sum of all nodes = 16293\n",
     185 + " Overhead ratio=12.408987052551408\n",
     186 + "\n",
     187 + "CPU times: user 7min 51s, sys: 14.1 s, total: 8min 5s\n",
     188 + "Wall time: 8min 3s\n"
     189 + ]
     190 + }
     191 + ],
     192 + "source": [
     193 + "%%time\n",
     194 + "from data_cache.utils import GraphBuilder\n",
     195 + "gb = GraphBuilder()\n",
     196 + "gb.build(per, psc_company_df, psc_person_df, officer_df)\n",
     197 + "gb.break_into_subgraphs(1000)"
     198 + ]
     199 + },
     200 + {
     201 + "cell_type": "markdown",
     202 + "id": "17fb8fcd-feb8-4ee3-b561-804813d86dd0",
     203 + "metadata": {},
     204 + "source": [
     205 + "### Add node describtors for risk calculation later:"
     206 + ]
     207 + },
     208 + {
     209 + "cell_type": "code",
     210 + "execution_count": 7,
     211 + "id": "503789de-af85-466b-890d-c379e8da8c4e",
     212 + "metadata": {
     213 + "iooxa": {
     214 + "id": {
     215 + "block": "Dag1oRYbibJPCEAO30P2",
     216 + "project": "cNRcIFOMOBAHj5O57Joq",
     217 + "version": 1
     218 + },
     219 + "outputId": {
     220 + "block": "jzcbgB4EZdoFec0VdJhm",
     221 + "project": "cNRcIFOMOBAHj5O57Joq",
     222 + "version": 1
     223 + }
     224 + }
     225 + },
     226 + "outputs": [
     227 + {
     228 + "name": "stderr",
     229 + "output_type": "stream",
     230 + "text": [
     231 + "company: 100%|██████████| 5107631/5107631 [00:46<00:00, 110139.75it/s]\n",
     232 + "officer_person: 100%|██████████| 10035057/10035057 [00:29<00:00, 342760.19it/s]\n",
     233 + "officer_company: 100%|██████████| 313158/313158 [00:00<00:00, 626877.37it/s]\n",
     234 + "psc_person: 100%|██████████| 9012596/9012596 [00:22<00:00, 397809.45it/s]\n",
     235 + "psc_company: 100%|██████████| 702472/702472 [00:01<00:00, 491691.74it/s]\n"
     236 + ]
     237 + },
     238 + {
     239 + "name": "stdout",
     240 + "output_type": "stream",
     241 + "text": [
     242 + "CPU times: user 1min 51s, sys: 5.37 s, total: 1min 57s\n",
     243 + "Wall time: 1min 56s\n"
     244 + ]
     245 + }
     246 + ],
     247 + "source": [
     248 + "%%time\n",
     249 + "%load_ext autoreload\n",
     250 + "%autoreload 2\n",
     251 + "from data_cache.utils import NodeDescriber\n",
     252 + "\n",
     253 + "nd = NodeDescriber(per)\n",
     254 + "nd.add_metadata(company_df, officer_df, psc_company_df, psc_person_df)"
     255 + ]
     256 + },
     257 + {
     258 + "cell_type": "markdown",
     259 + "id": "76448310-b857-459d-b6e1-6185dc908d9a",
     260 + "metadata": {},
     261 + "source": [
     262 + "### Find if entities have possible matches against Politicians datasets"
     263 + ]
     264 + },
     265 + {
     266 + "cell_type": "code",
     267 + "execution_count": null,
     268 + "id": "7f4c8507-2805-4706-aaf8-07899b177da9",
     269 + "metadata": {},
     270 + "outputs": [],
     271 + "source": [
     272 + "PEP, RUS = {}, {}\n",
     273 + "for name, dob, country in zip(all_politicians.NAME, all_politicians.DOB, all_politicians.COUNTRY):\n",
     274 + " name = unidecode(name).lower()\n",
     275 + " PEP[name] = {\"country\": country, \"source\": \"every_politician\"}\n",
     276 + " if isinstance(dob, str):\n",
     277 + " PEP[name][\"yob\"] = int(dob[:4])\n",
     278 + " if len(dob) == 10:\n",
     279 + " PEP[name][\"mob\"] = int(dob[5:7])\n",
     280 + "for name, dob, cat, tx in zip(ru_bl_peps.NAME_EN, ru_bl_peps.DOB, ru_bl_peps.CATEGORY, ru_bl_peps.TAXPAYER_NUM):\n",
     281 + " name = unidecode(name).lower()\n",
     282 + " RUS[name] = {\"country\": \"RU/BY\", \"category\": cat, \"taxpayer_num\": tx, \"source\": \"rupep.org\"}\n",
     283 + " if isinstance(dob, str) and len(dob) == 10:\n",
     284 + " RUS[name][\"yob\"] = int(dob[-4:])\n",
     285 + " RUS[name][\"mob\"] = int(dob[3:5])"
     286 + ]
     287 + },
     288 + {
     289 + "cell_type": "markdown",
     290 + "id": "276470f1-084f-44c7-bc30-fa3a6905c170",
     291 + "metadata": {
     292 + "iooxa": {
     293 + "id": {
     294 + "block": "27gIHsImOPFu9FiC5HJe",
     295 + "project": "cNRcIFOMOBAHj5O57Joq",
     296 + "version": 1
     297 + }
     298 + }
     299 + },
     300 + "source": [
     301 + "## Build subnetwork stats:"
     302 + ]
     303 + },
     304 + {
     305 + "cell_type": "code",
     306 + "execution_count": 12,
     307 + "id": "b78891af-f60f-415f-8fc4-dbd0ab0487f6",
     308 + "metadata": {
     309 + "iooxa": {
     310 + "id": {
     311 + "block": "EETu5XmgZCjgCT2mtRmt",
     312 + "project": "cNRcIFOMOBAHj5O57Joq",
     313 + "version": 1
     314 + },
     315 + "outputId": {
     316 + "block": "zkzN2j3pJpxUBmPjXebN",
     317 + "project": "cNRcIFOMOBAHj5O57Joq",
     318 + "version": 1
     319 + }
     320 + },
     321 + "tags": []
     322 + },
     323 + "outputs": [
     324 + {
     325 + "name": "stderr",
     326 + "output_type": "stream",
     327 + "text": [
     328 + "Precomputing risk signals: 100%|██████████| 1000000/1000000 [50:37<00:00, 329.22it/s] \n"
     329 + ]
     330 + }
     331 + ],
     332 + "source": [
     333 + "from tqdm import tqdm \n",
     334 + "import numpy as np\n",
     335 + "from utils import TAX_HEAVENS\n",
     336 + "\n",
     337 + "# How many networks to cache. UKCH Total in 2022 is about 7M.\n",
     338 + "N = 1_000_000\n",
     339 + "PROXY_TH = 50\n",
     340 + "PARTITION_SIZE = 1000\n",
     341 + "\n",
     342 + "subnetwork_ids = list(gb.hash_to_subn_map.keys())[:N]\n",
     343 + "\n",
     344 + "def count_company_ratio(ns):\n",
     345 + " return np.mean([not n.startswith(\"p|\") for n in ns])\n",
     346 + "\n",
     347 + "def calculate_cyclicity(H):\n",
     348 + " if H.number_of_nodes() < 1:\n",
     349 + " print(f\"Non existent network: {netws}\")\n",
     350 + " return 0\n",
     351 + " E = H.number_of_edges()\n",
     352 + " N = H.number_of_nodes()\n",
     353 + " return (E + 1 - N)/(N*np.log(N))\n",
     354 + "\n",
     355 + "def netw_names(names):\n",
     356 + " names = set(names) - {None}\n",
     357 + " return \", \".join(sorted(names))\n",
     358 + "\n",
     359 + "def metadata_converter(md):\n",
     360 + " return \"; \".join([f\"{k}: {v}\" for k, v in md.items()])\n",
     361 + "\n",
     362 + "clc, node_num, dfs, company_ratio, entity_names, multi_jurisdiction, jur_names, netw_tax_haven = [], [], [], [], [], [], [], []\n",
     363 + "nodes, proxy, is_person, tax_haven, jur, node_metadata, netws = [], [], [], [], [], [], []\n",
     364 + "pep, pepm, r, rm, netw_pep, netw_r = [], [], [], [], [], []\n",
     365 + "for _id in tqdm(subnetwork_ids[:N], desc= \"Precomputing risk signals\"):\n",
     366 + " \n",
     367 + " # Get networkx subgraph:\n",
     368 + " nw = gb.hash_to_subn_map[_id]\n",
     369 + " H = gb.G_undir.subgraph(nw)\n",
     370 + " \n",
     371 + " # Network:\n",
     372 + " clc.append(calculate_cyclicity(H))\n",
     373 + " node_num.append(len(nw))\n",
     374 + " company_ratio.append(count_company_ratio(nw))\n",
     375 + " \n",
     376 + " # Edges:\n",
     377 + " df = pd.DataFrame(H.edges.data(\"edge_type\"), columns =['source', 'target', 'type'])\n",
     378 + " df['subgraph_hash'] =_id\n",
     379 + " df['subgraph_partition'] =_id % PARTITION_SIZE\n",
     380 + " dfs.append(df)\n",
     381 + " \n",
     382 + " # Nodes:\n",
     383 + " ns = gb.hash_to_subn_map[_id]\n",
     384 + " netw_jurs, netw_entity_names = [], []\n",
     385 + " netw_pep_value, netw_r_value = 0, 0\n",
     386 + " for n in ns:\n",
     387 + " nodes.append(n)\n",
     388 + " netws.append(_id)\n",
     389 + " proxy.append(int(gb.G_undir.degree[n] > PROXY_TH))\n",
     390 + " is_person.append(int(n.startswith(\"p|\")))\n",
     391 + " node_metadata.append(nd.node_to_metadata.get(n, None))\n",
     392 + " \n",
     393 + " j = nd.node_to_jurs.get(n, set())\n",
     394 + " jur.append(\", \".join(sorted(j)))\n",
     395 + " tax_haven.append(int(len(j.intersection(TAX_HEAVENS)) > 0))\n",
     396 + " \n",
     397 + " netw_jurs.append(nd.node_to_jurs.get(n, None))\n",
     398 + " \n",
     399 + " name = nd.node_to_names.get(n, None)\n",
     400 + " netw_entity_names.append(name)\n",
     401 + " \n",
     402 + " if name is not None and name in RUS:\n",
     403 + " r.append(1)\n",
     404 + " rm.append(metadata_converter(RUS[name]))\n",
     405 + " netw_r_value += 1\n",
     406 + " else:\n",
     407 + " r.append(0)\n",
     408 + " rm.append(\"\")\n",
     409 + " \n",
     410 + " if name is not None and name in PEP:\n",
     411 + " pep.append(1)\n",
     412 + " pepm.append(metadata_converter(PEP[name]))\n",
     413 + " netw_pep_value += 1\n",
     414 + " else:\n",
     415 + " pep.append(0)\n",
     416 + " pepm.append(\"\")\n",
     417 + "\n",
     418 + " netw_pep.append(netw_pep_value)\n",
     419 + " netw_r.append(netw_r_value)\n",
     420 + " jurs_in_subnetwork = list(set().union(*[n for n in netw_jurs if n is not None]))\n",
     421 + " jur_names.append(\"; \".join(jurs_in_subnetwork))\n",
     422 + " netw_tax_haven.append(int(len(set(jurs_in_subnetwork).intersection(TAX_HEAVENS))> 0))\n",
     423 + " multi_jurisdiction.append(int(len(jurs_in_subnetwork) > 1))\n",
     424 + " entity_names.append(netw_names(netw_entity_names))\n"
     425 + ]
     426 + },
     427 + {
     428 + "cell_type": "code",
     429 + "execution_count": 17,
     430 + "id": "efa05093-8e03-44d0-922f-e8e37302bc40",
     431 + "metadata": {},
     432 + "outputs": [
     433 + {
     434 + "name": "stdout",
     435 + "output_type": "stream",
     436 + "text": [
     437 + "CPU times: user 4min, sys: 12.2 s, total: 4min 12s\n",
     438 + "Wall time: 4min 11s\n"
     439 + ]
     440 + }
     441 + ],
     442 + "source": [
     443 + "%%time\n",
     444 + "PROXY_NETW_ID = set(nodes_df[nodes_df.proxy_dir==1].subgraph_hash)\n",
     445 + "subnetwork_df[\"proxy\"] = [int(s in PROXY_NETW_ID) for s in subnetwork_df.network_id.tolist()]\n",
     446 + "subnetwork_df = pd.DataFrame(data = {\n",
     447 + " \"network_id\": subnetwork_ids, \n",
     448 + " \"cyclicity\": clc, \n",
     449 + " \"node_num\": node_num, \n",
     450 + " \"company_ratio\": company_ratio,\n",
     451 + " \"multi_jurisdiction\": multi_jurisdiction, \n",
     452 + " \"tax_haven\": netw_tax_haven, \n",
     453 + " \"potential_pep_match\": netw_pep, \n",
     454 + " \"potential_rus_pep_match\": netw_r, \n",
     455 + " \"entity_names\": entity_names, \n",
     456 + " \"jur_names\": jur_names, \n",
     457 + "})\n",
     458 + "nodes_df = pd.DataFrame(data = {\n",
     459 + " 'node_id': nodes, \n",
     460 + " 'subgraph_hash': netws, \n",
     461 + " \"is_person\": is_person,\n",
     462 + " \"proxy_dir\": proxy, \n",
     463 + " \"node_metadata\": node_metadata,\n",
     464 + " \"tax_haven\": tax_haven, \n",
     465 + " \"jur\": jur,\n",
     466 + " \"politician\": pep, \n",
     467 + " \"politician_metadata\": pepm, \n",
     468 + " \"rus_politician\": r,\n",
     469 + " \"rus_politician_metadata\": rm, \n",
     470 + "})\n",
     471 + "edges_df = pd.concat(dfs)"
     472 + ]
     473 + },
     474 + {
     475 + "cell_type": "code",
     476 + "execution_count": 38,
     477 + "id": "3d6af603-5fa9-4eb9-8ce0-52a75ff27a44",
     478 + "metadata": {},
     479 + "outputs": [
     480 + {
     481 + "data": {
     482 + "text/plain": [
     483 + "((81482446, 5), (62008292, 12), (1000000, 11))"
     484 + ]
     485 + },
     486 + "execution_count": 38,
     487 + "metadata": {},
     488 + "output_type": "execute_result"
     489 + }
     490 + ],
     491 + "source": [
     492 + "edges_df.shape, nodes_df.shape, subnetwork_df.shape"
     493 + ]
     494 + },
     495 + {
     496 + "cell_type": "code",
     497 + "execution_count": 35,
     498 + "id": "6164657c-f196-43b2-9c14-bc8bdce811fd",
     499 + "metadata": {},
     500 + "outputs": [],
     501 + "source": [
     502 + "subnetwork_df.to_parquet(schema['output_nodes'])\n",
     503 + "edges_df.to_parquet(schema['output_nodes'], partition_cols = [\"subgraph_partition\"])\n",
     504 + "nodes_df['subgraph_partition'] = nodes_df.subgraph_hash.apply(lambda x: x%1000)\n",
     505 + "nodes_df.to_parquet(schema['output_nodes'], partition_cols = [\"subgraph_partition\"])"
     506 + ]
     507 + },
     508 + {
     509 + "cell_type": "code",
     510 + "execution_count": null,
     511 + "id": "5a26567c-2a19-484b-98bb-1bb4ccc154a9",
     512 + "metadata": {},
     513 + "outputs": [],
     514 + "source": []
     515 + }
     516 + ],
     517 + "metadata": {
     518 + "iooxa": {
     519 + "id": {
     520 + "block": "VE4MftXdmZ856rMTec6u",
     521 + "project": "cNRcIFOMOBAHj5O57Joq",
     522 + "version": 1
     523 + }
     524 + },
     525 + "kernelspec": {
     526 + "display_name": "Python 3",
     527 + "language": "python",
     528 + "name": "python3"
     529 + },
     530 + "language_info": {
     531 + "codemirror_mode": {
     532 + "name": "ipython",
     533 + "version": 3
     534 + },
     535 + "file_extension": ".py",
     536 + "mimetype": "text/x-python",
     537 + "name": "python",
     538 + "nbconvert_exporter": "python",
     539 + "pygments_lexer": "ipython3",
     540 + "version": "3.8.7"
     541 + }
     542 + },
     543 + "nbformat": 4,
     544 + "nbformat_minor": 5
     545 +}
     546 + 
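For reference, the `calculate_cyclicity` helper in the risk-signal cell above computes

$$\mathrm{cyclicity}(H) = \frac{E + 1 - N}{N \ln N},$$

where $E$ and $N$ are the edge and node counts of subnetwork $H$. For a connected graph, $E - (N - 1)$ is the circuit rank (the number of independent cycles), so tree-shaped ownership structures score 0 and densely cross-linked ones score higher, with the $N \ln N$ factor damping the effect of sheer size.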
    data_cache/data_schema.py
      1 +schema = {
      2 + "politicians_parsed": "/sanctions_and_peps/parsed/politicians_parsed.csv",
      3 + "ru_bl_peps_parsed": "/sanctions_and_peps/parsed/ru_bl_peps_parsed.csv",
      4 + # Assumed path for the parsed UN sanctions list the notebook reads (follows the pattern above):
      5 + "un_parsed": "/sanctions_and_peps/parsed/un_parsed.csv",
      6 + # Source: UK Companies House (UKCH), see the schema README for more details:
      7 + "ukch_companies": "/insert_download_folder/ukch_companies.parquet",
      8 + "ukch_officers": "/insert_download_folder/ukch_officers.parquet",
      9 + "psc_company": "/insert_download_folder/psc_company.parquet",
      10 + "psc_person": "/insert_download_folder/psc_person.parquet",
      11 + # Output locations for Streamlit to consume:
      12 + "output_nodes": "/data/nodes.parquet",
      13 + "output_edges": "/data/edges.parquet",
      14 + "output_networks": "/data/networks.parquet"
      15 +}
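A read-side sketch (not part of the source files) of why the node and edge outputs are partitioned by `subgraph_partition` (`subgraph_hash % 1000`): a consumer such as the Streamlit app can then load one subnetwork without scanning the full multi-million-row tables.

```python
import pandas as pd
from data_cache.data_schema import schema

subgraph_hash = 123456789          # hypothetical subnetwork id
partition = subgraph_hash % 1000   # matches the PARTITION_SIZE used at write time

# Partition pruning means only one of the ~1000 partitions is read:
nodes = pd.read_parquet(
    schema["output_nodes"],
    filters=[
        ("subgraph_partition", "=", partition),
        ("subgraph_hash", "=", subgraph_hash),
    ],
)
```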
    data_cache/utils.py
     1 +import random
     2 +from tqdm import tqdm
     3 +import networkx as nx
     4 +from collections import defaultdict
     5 + 
     6 + 
      7 +TAX_HAVENS = {
     8 + 'jersey',
     9 + 'luxembourg',
     10 + 'virgin islands, british',
     11 + 'cayman islands',
     12 + 'switzerland',
     13 + 'hong kong',
     14 + 'singapore',
     15 + 'guernsey',
     16 + 'bermuda',
     17 + 'seychelles',
     18 + 'united arab emirates',
     19 + 'marshall islands',
     20 + 'belize',
     21 + "isle of man",
     22 + "barbados",
     23 + "panama",
     24 + "bahamas",
     25 + "saint lucia",
     26 + "dominica",
     27 +}
     28 + 
     29 + 
     30 +class ProduceEntityResolution:
     31 + def __init__(self, company_df, max_rand=10 ** 8, min_len=7):
     32 + self.company_name_to_reg = {
     33 + n.upper().strip(): r for n, r in zip(company_df.company_name, company_df.company_number)
     34 + }
     35 + self.max_rand = max_rand
     36 + self.er_map = {}
     37 + self.min_len = min_len
     38 + 
     39 + def missing_entry_randomisation(self):
     40 + return random.randint(0, self.max_rand)
     41 + 
     42 + def get_person_identifier(self, forenames, surname, yob, mob):
     43 + if not surname:
     44 + return f"p|rand_{self.missing_entry_randomisation()}"
     45 + if forenames is None or len(forenames) == 0:
     46 + return f"p|rand_{self.missing_entry_randomisation()}"
     47 + elif isinstance(forenames, list):
     48 + name = " ".join(forenames).lower()
     49 + else:
     50 + name = forenames.lower()
     51 + return f"p|{name} {surname.lower()}|{yob}|{mob}"
     52 + 
     53 + def get_company_identifier(self, cname, cnum):
     54 + if cnum and len(cnum) >= self.min_len:
     55 + return f"c|{cnum}"
     56 + if cname in self.company_name_to_reg:
     57 + return f"c|{self.company_name_to_reg[cname]}"
     58 + if len(cname) > self.min_len:
     59 + return f"c|company_named_{cname}"
     60 + return f"c|rand_{self.missing_entry_randomisation()}"
     61 + 
     62 + def resolve_entities(self, company_df, officer_df, psc_company_df, psc_person_df):
     63 + 
     64 + # Company ER
     65 + inps = zip(company_df.mention_id, company_df.company_name, company_df.company_number)
     66 + for mid, cname, cnum in tqdm(inps, desc="UKCH Company er map", total=company_df.shape[0]):
     67 + self.er_map[mid] = self.get_company_identifier(cname, cnum)
     68 + 
     69 + off_person_df = officer_df[~officer_df.is_corporate_body]
     70 + inps = zip(
     71 + off_person_df.mention_id,
     72 + off_person_df.forenames,
     73 + off_person_df.surname,
     74 + off_person_df.yob,
     75 + off_person_df.mob,
     76 + )
     77 + for mid, fs, cs, yob, mob in tqdm(inps, desc="Officer (person) er map", total=off_person_df.shape[0]):
     78 + self.er_map[mid] = self.get_person_identifier(fs, cs, yob, mob)
     79 + 
     80 + off_company_df = officer_df[officer_df.is_corporate_body]
     81 + inps = zip(off_company_df.mention_id, off_company_df.surname)
     82 + for mid, cname in tqdm(inps, desc="Officer (company) er map", total=off_company_df.shape[0]):
     83 + self.er_map[mid] = self.get_company_identifier(cname, None)
     84 + 
     85 + # PSC Company ER
     86 + inps = zip(psc_company_df.mention_id, psc_company_df.name, psc_company_df.psc_derived_company_number)
     87 + for mid, cname, cnum in tqdm(inps, desc="PSC (company) er map", total=psc_company_df.shape[0]):
     88 + self.er_map[mid] = self.get_company_identifier(cname, cnum)
     89 + 
     90 + # People of significant control [physical people] ER:
     91 + inps = zip(
     92 + psc_person_df.mention_id,
     93 + psc_person_df.name_elements_forename,
     94 + psc_person_df.name_elements_middle_name,
     95 + psc_person_df.name_elements_surname,
     96 + psc_person_df.date_of_birth_year,
     97 + psc_person_df.date_of_birth_month,
     98 + )
     99 + for mid, cf, cm, cs, yob, mob in tqdm(inps, desc="PSC (person) er map", total=psc_person_df.shape[0]):
     100 + forenames = [x for x in [cf, cm] if x is not None]
     101 + self.er_map[mid] = self.get_person_identifier(forenames, cs, yob, mob)
     102 + 
     103 + 
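Illustrative calls for the identifier conventions above (values are hypothetical; the results follow the methods as written): people resolve to `p|<forenames> <surname>|<yob>|<mob>`, companies to `c|<registration number>`, and records with missing keys fall back to a random id.

```python
per = ProduceEntityResolution(company_df)  # company_df as loaded in the notebook

per.get_person_identifier(["jane"], "Doe", 1980, 4)  # -> "p|jane doe|1980|4"
per.get_company_identifier("ACME LTD", "01234567")   # -> "c|01234567"
per.get_person_identifier(None, None, None, None)    # -> "p|rand_<random int>"
```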
     104 +class GraphBuilder:
     105 + def __init__(self):
     106 + self.G = nx.DiGraph()
     107 + self.G_undir = nx.Graph()
     108 + self.CCs = []
     109 + self.ccs_len = []
     110 + self.hash_to_subn_map = {}
     111 + 
     112 + def build_G(self, per, psc_company_df, psc_person_df, officer_df):
     113 + for target_eid, source_mid in tqdm(zip(psc_company_df.company_number, psc_company_df.mention_id),
     114 + desc="PSC company graph"):
     115 + self.G_undir.add_edge(per.er_map[source_mid], f"c|{target_eid}", edge_type="own")
     116 + 
     117 + for target_eid, source_mid in tqdm(zip(psc_person_df.company_number, psc_person_df.mention_id),
     118 + desc="PSC person graph"):
     119 + self.G_undir.add_edge(per.er_map[source_mid], f"c|{target_eid}", edge_type="own")
     120 + 
     121 + for target_eid, source_mid in tqdm(zip(officer_df.company_number, officer_df.mention_id), desc="Officer graph"):
     122 + self.G_undir.add_edge(per.er_map[source_mid], f"c|{target_eid}", edge_type="control")
     123 + 
     124 + def get_neigh(self, nodes, radius=2):
     125 + nodes = set([n for n in nodes if n in self.G_undir])
     126 + for node in list(nodes):
     127 + neighbors = set(self.G_undir.neighbors(node))
     128 + neighbors = set([x for x in neighbors if isinstance(x, str)])
     129 + nodes = nodes.union(neighbors)
     130 + return nodes if radius == 1 else self.get_neigh(nodes, radius - 1)
     131 + 
     132 + def build(self, per, psc_company_df, psc_person_df, officer_df):
     133 + self.build_G(per, psc_company_df, psc_person_df, officer_df)
     134 + self.CCs = sorted(nx.connected_components(self.G_undir), key=len, reverse=True)
     135 + self.ccs_len = [len(c) for c in self.CCs]
     136 + print(f"Top 10 Connected component sizes: {self.ccs_len[:10]}")
     137 + 
     138 + @staticmethod
      139 + def hash_subnetwork(nodes):
     140 + return sum(hash(n) % 2 ** 32 for n in nodes)
     141 + 
     142 + def break_into_subgraphs(self, max_size=1000):
     143 + for cc in self.CCs:
     144 + # Small connected components go in as they are:
     145 + if len(cc) < max_size:
     146 + subnetwork_hash = self.hash_subnetwork(cc)
     147 + self.hash_to_subn_map[subnetwork_hash] = cc
     148 + # Giant component gets added by unique neighbourhoods
     149 + else:
     150 + size_before = len(self.hash_to_subn_map)
     151 + extra_nodes = 0
     152 + for node in tqdm(cc, desc=f"Breaking down Giant CC (size {len(cc)})"):
     153 + subnetwork = self.get_neigh([node])
     154 + subnetwork_hash = self.hash_subnetwork(subnetwork)
     155 + if subnetwork_hash not in self.hash_to_subn_map:
     156 + self.hash_to_subn_map[subnetwork_hash] = subnetwork
     157 + extra_nodes += len(subnetwork)
     158 + print(f"""Giant Component of size {len(cc)} was broken down.
     159 + Added {len(self.hash_to_subn_map) - size_before} neighbourhoods,
     160 + Sum of all nodes = {extra_nodes}
     161 + Overhead ratio={extra_nodes / len(cc)}
     162 +""")
     163 + 
     164 + 
     165 +class NodeDescriber:
     166 + def __init__(self, per):
     167 + self.per = per
     168 + self.node_to_jurs = defaultdict(set)
     169 + self.node_to_names = {}
     170 + self.node_to_metadata = {}
     171 + self.metadata_cols = {
     172 + "company": ["mention_id", "name", "industry_code", "company_number", "country", "address"],
     173 + "psc_person": ["mention_id", "name", "nationality", "address_postal_code", "date_of_birth_year",
     174 + "date_of_birth_month"],
     175 + "psc_company": ["mention_id", "name", "psc_derived_company_number", "combined_address", "address_country",
     176 + "kind"],
     177 + "officer_person": ["mention_id", "forenames", "surname", "nationality", "yob", "mob"],
     178 + "officer_company": ["mention_id", "surname", "country", 'appointment_role', 'post_town', 'postal_code'],
     179 + }
     180 + 
     181 + def unify_jur(self, c):
     182 + if c is None:
     183 + return None
     184 + if c in ['england', 'wales', 'northern ireland', 'scotland', "uk"]:
     185 + return 'united kingdom'
     186 + if c in ["usa", "united states", "united states of america"]:
     187 + return "united states"
     188 + if c in ["british virgin islands", "virgin islands, british", "virgin islands"]:
     189 + return "virgin islands, british"
     190 + return c
     191 + 
     192 + def add_node_to_names(self):
     193 + for eid in set(self.per.er_map.values()):
     194 + if eid.startswith("p|"):
     195 + self.node_to_names[eid] = eid.split("|")[1]
     196 + 
     197 + def add_dataset_metadata(self, df, dataset_name):
     198 + c = self.metadata_cols[dataset_name]
     199 + inp = zip(df[c[0]], df[c[1]], df[c[2]], df[c[3]], df[c[4]], df[c[5]])
     200 + for tpl in tqdm(inp, desc=dataset_name, total=df.shape[0]):
     201 + eid = self.per.er_map[tpl[0]]
     202 + if eid not in self.node_to_metadata:
     203 + self.node_to_metadata[eid] = {}
     204 + for col_inx in range(1, len(c)):
     205 + col = self.metadata_cols[dataset_name][col_inx]
     206 + self.node_to_metadata[eid][col] = tpl[col_inx]
     207 + if col in ["country", "address_country", "reg_address_country"]:
     208 + if isinstance(tpl[col_inx], str):
     209 + unified_jur = self.unify_jur(tpl[col_inx].lower())
     210 + self.node_to_jurs[eid].add(unified_jur)
     211 + 
     212 + def add_metadata(self, company_df, officer_df, psc_company_df, psc_person_df):
     213 + self.add_dataset_metadata(company_df, "company")
     214 + self.add_dataset_metadata(officer_df[~officer_df.is_corporate_body], "officer_person")
     215 + self.add_dataset_metadata(officer_df[officer_df.is_corporate_body], "officer_company")
     216 + self.add_dataset_metadata(psc_person_df, "psc_person")
     217 + self.add_dataset_metadata(psc_company_df, "psc_company")
     218 + self.add_node_to_names()
     219 + 
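A property worth noting about `GraphBuilder.hash_subnetwork`: summing per-node hashes makes the subnetwork hash independent of iteration order, which is what lets `break_into_subgraphs` deduplicate overlapping neighbourhoods. One caveat (our observation, not from the source): Python randomizes string hashes per process, so these ids are only stable across runs if `PYTHONHASHSEED` is pinned.

```python
nodes = {"c|01234567", "p|jane doe|1980|4"}  # hypothetical node ids
h1 = GraphBuilder.hash_subnetwork(nodes)
h2 = GraphBuilder.hash_subnetwork(sorted(nodes))  # different iteration order
assert h1 == h2  # same node set -> same hash within one process
```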