1 | 1 | | import streamlit as st |
2 | 2 | | from streamlit_agraph import Node, Edge |
3 | | - | import json |
| 3 | + | |
4 | 4 | | import pandas as pd |
5 | 5 | | |
6 | | - | NODE_COLOUR_NON_DODGY = "#72EF77" |
7 | | - | NODE_COLOUR_DODGY = "#F63333" |
| 6 | + | NODE_COLOUR_NON_DODGY = "#D1E2F8" |
| 7 | + | NODE_COLOUR_DODGY = "#F99292" |
8 | 8 | | NODE_IMAGE_PERSON = "http://i.ibb.co/LrY3tfw/747376.png" # https://www.flaticon.com/free-icon/user_747376 |
9 | 9 | | NODE_IMAGE_COMPANY = "http://i.ibb.co/fx6r1dZ/4812244.png" # https://www.flaticon.com/free-icon/company_4812244 |
10 | 10 | | |
11 | 11 | | |
12 | 12 | | @st.cache() |
def get_subgraph_df():
    """Load the networks table from parquet, indexed by ``network_id``.

    Reads ``./data/1m_networks.parquet`` with the pyarrow engine and moves
    the ``network_id`` column into the index.

    Returns:
        pandas.DataFrame: the networks table with ``network_id`` as index.
    """
    networks = pd.read_parquet("./data/1m_networks.parquet", engine="pyarrow")
    return networks.set_index("network_id")
17 | 18 | | |
18 | 19 | | |
19 | 20 | | @st.cache() |
def get_subgraph_nodes_df(subgraph_hash):
    """Load the node rows belonging to a single subgraph.

    Args:
        subgraph_hash: value compared against the ``subgraph_hash`` column.

    Returns:
        pandas.DataFrame: rows of ``./data/1m_nodes.parquet`` selected by the
        ``subgraph_hash = <subgraph_hash>`` filter handed to the parquet
        reader.
    """
    # The predicate is passed to the reader rather than applied after a full
    # load of the file.
    only_this_subgraph = [[("subgraph_hash", "=", subgraph_hash)]]
    return pd.read_parquet(
        "./data/1m_nodes.parquet",
        filters=only_this_subgraph,
        engine="pyarrow",
    )
26 | 29 | | |
27 | 30 | | |
28 | 31 | | @st.cache() |
def get_subgraph_edges_df(subgraph_hash):
    """Load the edge rows belonging to a single subgraph.

    Args:
        subgraph_hash: value compared against the ``subgraph_hash`` column.

    Returns:
        pandas.DataFrame: rows of ``./data/1m_edges.parquet`` selected by the
        ``subgraph_hash = <subgraph_hash>`` filter handed to the parquet
        reader.
    """
    # The predicate is passed to the reader rather than applied after a full
    # load of the file.
    only_this_subgraph = [[("subgraph_hash", "=", subgraph_hash)]]
    edges = pd.read_parquet(
        "./data/1m_edges.parquet",
        filters=only_this_subgraph,
        engine="pyarrow",
    )
    return edges
35 | 38 | | |
36 | 39 | | |
| 40 | + | @st.cache() |
def get_subgraph_with_risk_score(
    subgraph_table,
    weight_cyclic,
    weight_company_ratio,
    weight_proxy_directors,
    weight_multi_jurisdiction,
    weight_tax_havens,
    weight_pep,
    weight_russian_pep,
):
    """Rank subgraphs by a weighted sum of max-normalised risk features.

    Each feature column is divided by its own column maximum, multiplied by
    the matching weight, and the terms are summed into a ``total_risk``
    column. The input table is not modified; a copy is scored.

    Args:
        subgraph_table: DataFrame holding the columns ``cyclicity``,
            ``company_ratio``, ``multi_jurisdiction``, ``tax_haven``,
            ``proxy``, ``potential_pep_match`` and
            ``potential_rus_pep_match``.
        weight_cyclic: weight applied to ``cyclicity``.
        weight_company_ratio: weight applied to ``company_ratio``.
        weight_proxy_directors: weight applied to ``proxy``.
        weight_multi_jurisdiction: weight applied to ``multi_jurisdiction``.
        weight_tax_havens: weight applied to ``tax_haven``.
        weight_pep: weight applied to ``potential_pep_match``.
        weight_russian_pep: weight applied to ``potential_rus_pep_match``.

    Returns:
        pandas.DataFrame: copy of ``subgraph_table`` with a ``total_risk``
        column, sorted by it in descending order and restricted to rows
        whose ``total_risk`` is strictly positive.

    Raises:
        KeyError: if one of the feature columns is missing.
    """
    # Order matters only for floating-point reproducibility: terms are added
    # in the same sequence as the original hand-written expression.
    weighted_features = (
        ("cyclicity", weight_cyclic),
        ("company_ratio", weight_company_ratio),
        ("multi_jurisdiction", weight_multi_jurisdiction),
        ("tax_haven", weight_tax_havens),
        ("proxy", weight_proxy_directors),
        ("potential_pep_match", weight_pep),
        ("potential_rus_pep_match", weight_russian_pep),
    )

    out = subgraph_table.copy()
    total_risk = 0.0
    for column, weight in weighted_features:
        peak = out[column].max()
        # A zero (or NaN, for an empty table) maximum cannot be used as a
        # normaliser: dividing by it would turn the term into NaN for every
        # row, which would poison the sum and make the final
        # ``total_risk > 0`` filter drop the whole table. Such a feature
        # carries no signal, so it simply contributes nothing.
        if pd.isna(peak) or peak == 0:
            continue
        total_risk = total_risk + out[column] * weight / peak

    out["total_risk"] = total_risk
    return out.sort_values(by="total_risk", ascending=False).query("total_risk > 0")
50 | 71 | | |
51 | 72 | | |
52 | 73 | | def build_agraph_components( |
| skipped 6 lines |
59 | 80 | | edge_objects = [] |
60 | 81 | | |
61 | 82 | | for _, row in nodes.iterrows(): |
62 | | - | # node_metadata = json.loads(row["node_metadata"]) |
| 83 | + | |
| 84 | + | node_features = parse_node_features(row) |
| 85 | + | |
63 | 86 | | node_objects.append( |
64 | 87 | | Node( |
65 | 88 | | id=row["node_id"], |
66 | | - | label="\n".join(row["node_id"].split("|")[0].split(" ")), |
| 89 | + | label=node_features["name"], |
67 | 90 | | size=20, |
68 | | - | # color=NODE_COLOUR_DODGY |
69 | | - | # if (row["pep"] > 0 or row["sanction"] > 0) |
70 | | - | # else NODE_COLOUR_NON_DODGY, |
71 | | - | image=NODE_IMAGE_PERSON, |
72 | | - | # if row["is_person"] == 1 |
73 | | - | # else NODE_IMAGE_COMPANY, |
| 91 | + | color=NODE_COLOUR_DODGY |
| 92 | + | if node_features["is_dodgy"] |
| 93 | + | else NODE_COLOUR_NON_DODGY, |
| 94 | + | image=NODE_IMAGE_PERSON |
| 95 | + | if node_features["is_person"] |
| 96 | + | else NODE_IMAGE_COMPANY, |
74 | 97 | | shape="circularImage", |
75 | 98 | | ) |
76 | 99 | | ) |
| skipped 2 lines |
79 | 102 | | edge_objects.append( |
80 | 103 | | Edge( |
81 | 104 | | source=row["source"], |
82 | | - | # label=row["type"][0], |
| 105 | + | label=row["type"][0], |
83 | 106 | | target=row["target"], |
| 107 | + | smooth=True, |
84 | 108 | | ) |
85 | 109 | | ) |
86 | 110 | | |
87 | 111 | | return (node_objects, edge_objects) |
88 | 112 | | |
89 | 113 | | |
| 114 | + | @st.cache() |
90 | 115 | | def build_markdown_strings_for_node(nodes_selected): |
91 | 116 | | """Separate into People and Company strings""" |
92 | 117 | | |
| skipped 2 lines |
95 | 120 | | markdown_strings["people"] = [] |
96 | 121 | | |
97 | 122 | | for _, row in nodes_selected.iterrows(): |
98 | | - | node_metadata = { |
99 | | - | "name": row["node_id"], |
100 | | - | "is_proxy": row["proxy_dir"], |
101 | | - | "is_person": True, |
102 | | - | } |
103 | 123 | | |
104 | | - | # node_metadata = json.loads(row["node_metadata"]) |
105 | | - | # node_sanctions = ( |
106 | | - | # "" if row["sanction"] == 0 else f"! SANCTIONED: {row['sanction_metadata']}" |
107 | | - | # ) |
108 | | - | # node_pep = "" if row["pep"] == 0 else f"! PEP: {row['pep_metadata']}" |
| 124 | + | node_features = parse_node_features(row) |
109 | 125 | | |
110 | | - | node_sanctions = "" |
111 | | - | node_pep = "" |
112 | | - | |
113 | | - | if node_metadata["is_person"]: |
114 | | - | # node_title = f"{node_metadata['name']} [{node_metadata['nationality']}/{node_metadata['yob']}/{node_metadata['mob']}]" |
115 | | - | node_title = f"{node_metadata['name']}" |
| 126 | + | if node_features["is_person"]: |
| 127 | + | text = f"{node_features['name']} [{node_features['nationality']}/{node_features['yob']}/{node_features['mob']}]" |
116 | 128 | | key = "people" |
117 | 129 | | else: |
118 | | - | # node_title = f"{node_metadata['name']} [{row['jur']}/{node_metadata['reg']}/{node_metadata['address']}]" |
119 | | - | node_title = f"{node_metadata['name']}" |
| 130 | + | text = f"{node_features['name']} [{node_features['company_number']}/{node_features['country']}]" |
120 | 131 | | key = "companies" |
121 | 132 | | |
122 | | - | markdown_strings[key].append( |
123 | | - | "\n".join( |
124 | | - | [x for x in ["```", node_title, node_pep, node_sanctions] if len(x) > 0] |
125 | | - | ) |
126 | | - | ) |
| 133 | + | if node_features["is_proxy"]: |
| 134 | + | text += "\n !! Proxy director !!" |
| 135 | + | |
| 136 | + | if node_features["is_pep"]: |
| 137 | + | text += "\n !! Politically-exposed person !!" |
| 138 | + | text += f"\n -> {row['politician_metadata']}" |
| 139 | + | |
| 140 | + | if node_features["is_rus_pep"]: |
| 141 | + | text += "\n !! Russian politically-exposed person !!" |
| 142 | + | text += f"\n -> {row['rus_politician_metadata']}" |
| 143 | + | |
| 144 | + | markdown_strings[key].append(f"```\n{text}") |
127 | 145 | | |
128 | 146 | | return markdown_strings |
129 | 147 | | |
| 148 | + | |
| 149 | + | @st.cache() |
def parse_node_features(row):
    """Flatten one node row into the display features used by the UI.

    Args:
        row: mapping-like node record (e.g. a row from ``DataFrame.iterrows``)
            providing ``node_id``, ``is_person``, ``proxy_dir``,
            ``politician``, ``rus_politician`` and ``node_metadata``.
            ``node_metadata`` is expected to be a dict or a missing value;
            anything that is not a dict is treated as "no metadata".

    Returns:
        dict with the keys:
            ``is_person``, ``is_proxy``, ``is_pep``, ``is_rus_pep``: flags
                that are true when the matching row field equals 1.
            ``is_dodgy``: true when any of proxy / PEP / Russian PEP is set.
            ``name``: "<forenames> <surname>" for people; the metadata
                ``name`` (or else ``surname``) for other nodes; the row's
                ``node_id`` whenever that information is unavailable.
            ``address``, ``country``, ``yob``, ``mob``, ``nationality``,
            ``company_number``: metadata values, or ``""`` when absent or
                ``None``.
    """
    out = dict()

    out["is_person"] = row["is_person"] == 1
    out["is_proxy"] = row["proxy_dir"] == 1
    out["is_pep"] = row["politician"] == 1
    out["is_rus_pep"] = row["rus_politician"] == 1
    out["is_dodgy"] = out["is_proxy"] or out["is_pep"] or out["is_rus_pep"]

    # Normalise the metadata once: a missing cell may arrive as None or as a
    # (truthy) NaN, neither of which can be subscripted, and None-valued
    # entries are equivalent to absent ones for everything below.
    raw_metadata = row["node_metadata"]
    if not isinstance(raw_metadata, dict):
        raw_metadata = {}
    metadata = {k: v for k, v in raw_metadata.items() if v is not None}

    if out["is_person"]:
        forenames = metadata.get("forenames")
        surname = metadata.get("surname")
        name = forenames + " " + surname if forenames and surname else None
    else:
        name = metadata.get("name") or metadata.get("surname")

    # Always end up with a printable label: without this fallback a node
    # lacking name information would be labelled ``None``.
    out["name"] = name if name else row["node_id"]

    for key in (
        "address",
        "country",
        "yob",
        "mob",
        "nationality",
        "company_number",
    ):
        out[key] = metadata.get(key, "")

    return out
| 186 | + | |