1 1 import streamlit as st 2 2 from streamlit_agraph import Node, Edge 3 - import json 3 + 4 4 import pandas as pd 5 5 6 - NODE_COLOUR_NON_DODGY = "#72EF77" 7 - NODE_COLOUR_DODGY = "#F63333" 8 - NODE_IMAGE_PERSON = "http://i.ibb.co/LrY3tfw/747376.png" # https://www.flaticon.com/free-icon/user_747376 9 - NODE_IMAGE_COMPANY = "http://i.ibb.co/fx6r1dZ/4812244.png" # https://www.flaticon.com/free-icon/company_4812244 6 + NODE_COLOUR_NON_DODGY = "#D1E2F8" 7 + NODE_COLOUR_DODGY = "#F99292" 8 + NODE_IMAGE_PERSON = "https://raw.githubusercontent.com/sahanmar/451/main/data/person_icon.png" # https://www.flaticon.com/free-icon/user_747376 9 + NODE_IMAGE_COMPANY = "https://raw.githubusercontent.com/sahanmar/451/main/data/company_icon.png" # https://www.flaticon.com/free-icon/company_4812244 10 10 11 11 12 12 @st.cache() 13 13 def get_subgraph_df(): 14 - return pd.read_parquet("./data/network.parquet", engine="pyarrow").set_index( 15 - "network_id" 16 - ) 14 + subgraphs = pd.read_parquet( 15 + "./data/1m_networks.parquet", engine="pyarrow" 16 + ).set_index("network_id") 17 + return subgraphs 17 18 18 19 19 20 @st.cache() 20 21 def get_subgraph_nodes_df(subgraph_hash): 21 - return pd.read_parquet( 22 - "./data/nodes.parquet", 22 + nodes = pd.read_parquet( 23 + "./data/1m_nodes.parquet", 23 24 filters=[[("subgraph_hash", "=", subgraph_hash)]], 24 25 engine="pyarrow", 25 26 ) 27 + 28 + return nodes 26 29 27 30 28 31 @st.cache() 29 32 def get_subgraph_edges_df(subgraph_hash): 30 33 return pd.read_parquet( 31 - "./data/edges.parquet", 34 + "./data/1m_edges.parquet", 32 35 filters=[[("subgraph_hash", "=", subgraph_hash)]], 33 36 engine="pyarrow", 34 37 ) 35 38 36 39 40 + @st.cache() 37 41 def get_subgraph_with_risk_score( 38 42 subgraph_table, 39 - weight_chains, 40 43 weight_cyclic, 41 - weight_psc_haven, 44 + weight_company_ratio, 45 + weight_proxy_directors, 46 + weight_multi_jurisdiction, 47 + weight_tax_havens, 42 48 weight_pep, 43 -
weight_sanctions, 44 - weight_disqualified, 49 + weight_russian_pep, 45 50 ): 46 51 47 52 out = subgraph_table.copy() 48 - out["total_risk"] = out["cyclicity"] * weight_cyclic / out["cyclicity"].max() 49 - return out.sort_values(by="total_risk", ascending=False) 53 + out["total_risk"] = ( 54 + (out["cyclicity"] * weight_cyclic / out["cyclicity"].max()) 55 + + (out["company_ratio"] * weight_company_ratio / out["company_ratio"].max()) 56 + + ( 57 + out["multi_jurisdiction"] 58 + * weight_multi_jurisdiction 59 + / out["multi_jurisdiction"].max() 60 + ) 61 + + (out["tax_haven"] * weight_tax_havens / out["tax_haven"].max()) 62 + + (out["proxy"] * weight_proxy_directors / out["proxy"].max()) 63 + + (out["potential_pep_match"] * weight_pep / out["potential_pep_match"].max()) 64 + + ( 65 + out["potential_rus_pep_match"] 66 + * weight_russian_pep 67 + / out["potential_rus_pep_match"].max() 68 + ) 69 + ) 70 + return out.sort_values(by="total_risk", ascending=False).query("total_risk > 0") 50 71 51 72 52 73 def build_agraph_components( skipped 6 lines 59 80 edge_objects = [] 60 81 61 82 for _, row in nodes.iterrows(): 62 - # node_metadata = json.loads(row["node_metadata"]) 83 + 84 + node_features = parse_node_features(row) 85 + 63 86 node_objects.append( 64 87 Node( 65 88 id=row["node_id"], 66 - label="\n".join(row["node_id"].split("|")[0].split(" ")), 89 + label=node_features["name"], 67 90 size=20, 68 - # color=NODE_COLOUR_DODGY 69 - # if (row["pep"] > 0 or row["sanction"] > 0) 70 - # else NODE_COLOUR_NON_DODGY, 71 - image=NODE_IMAGE_PERSON, 72 - # if row["is_person"] == 1 73 - # else NODE_IMAGE_COMPANY, 91 + color=NODE_COLOUR_DODGY 92 + if node_features["is_dodgy"] 93 + else NODE_COLOUR_NON_DODGY, 94 + image=NODE_IMAGE_PERSON 95 + if node_features["is_person"] 96 + else NODE_IMAGE_COMPANY, 74 97 shape="circularImage", 75 98 ) 76 99 ) skipped 2 lines 79 102 edge_objects.append( 80 103 Edge( 81 104 source=row["source"], 82 - # label=row["type"][0], 105 +
label=row["type"][0], 83 106 target=row["target"], 107 + smooth=True, 84 108 ) 85 109 ) 86 110 87 111 return (node_objects, edge_objects) 88 112 89 113 114 + @st.cache() 90 115 def build_markdown_strings_for_node(nodes_selected): 91 116 """Separate into People and Company strings""" 92 117 skipped 2 lines 95 120 markdown_strings["people"] = [] 96 121 97 122 for _, row in nodes_selected.iterrows(): 98 - node_metadata = { 99 - "name": row["node_id"], 100 - "is_proxy": row["proxy_dir"], 101 - "is_person": True, 102 - } 103 - 104 - # node_metadata = json.loads(row["node_metadata"]) 105 - # node_sanctions = ( 106 - # "" if row["sanction"] == 0 else f"! SANCTIONED: {row['sanction_metadata']}" 107 - # ) 108 - # node_pep = "" if row["pep"] == 0 else f"! PEP: {row['pep_metadata']}" 109 123 110 - node_sanctions = "" 111 - node_pep = "" 124 + node_features = parse_node_features(row) 112 125 113 - if node_metadata["is_person"]: 114 - # node_title = f"{node_metadata['name']} [{node_metadata['nationality']}/{node_metadata['yob']}/{node_metadata['mob']}]" 115 - node_title = f"{node_metadata['name']}" 126 + if node_features["is_person"]: 127 + text = f"{node_features['name']} [{node_features['nationality']}/{node_features['yob']}/{node_features['mob']}]" 116 128 key = "people" 117 129 else: 118 - # node_title = f"{node_metadata['name']} [{row['jur']}/{node_metadata['reg']}/{node_metadata['address']}]" 119 - node_title = f"{node_metadata['name']}" 130 + text = f"{node_features['name']} [{node_features['company_number']}/{node_features['country']}]" 120 131 key = "companies" 121 132 122 - markdown_strings[key].append( 123 - "\n".join( 124 - [x for x in ["```", node_title, node_pep, node_sanctions] if len(x) > 0] 125 - ) 126 - ) 133 + if node_features["is_proxy"]: 134 + text += "\n !! Proxy director !!" 135 + 136 + if node_features["is_pep"]: 137 + text += "\n !! Politically-exposed person !!"
138 + text += f"\n -> {row['politician_metadata']}" 139 + 140 + if node_features["is_rus_pep"]: 141 + text += "\n !! Russian politically-exposed person !!" 142 + text += f"\n -> {row['rus_politician_metadata']}" 143 + 144 + markdown_strings[key].append(f"```\n{text}") 127 145 128 146 return markdown_strings 129 147 148 + 149 + @st.cache() 150 + def parse_node_features(row): 151 + out = dict() 152 + 153 + out["is_person"] = row["is_person"] == 1 154 + out["is_proxy"] = row["proxy_dir"] == 1 155 + out["is_pep"] = row["politician"] == 1 156 + out["is_rus_pep"] = row["rus_politician"] == 1 157 + out["is_dodgy"] = out["is_proxy"] or out["is_pep"] or out["is_rus_pep"] 158 + 159 + if row["node_metadata"]: 160 + node_metadata = row["node_metadata"] 161 + else: 162 + node_metadata = {"forenames": None, "surname": None, "name": None} 163 + 164 + if out["is_person"]: 165 + if node_metadata["forenames"] and node_metadata["surname"]: 166 + out["name"] = node_metadata["forenames"] + " " + node_metadata["surname"] 167 + else: 168 + out["name"] = row["node_id"] 169 + else: 170 + if node_metadata["name"]: 171 + out["name"] = node_metadata["name"] 172 + else: 173 + out["name"] = node_metadata["surname"] 174 + 175 + raw_metadata = row["node_metadata"] if row["node_metadata"] else {} 176 + raw_metadata = {k: v for k, v in raw_metadata.items() if v is not None} 177 + 178 + out["address"] = raw_metadata.get("address", "") 179 + out["country"] = raw_metadata.get("country", "") 180 + out["yob"] = raw_metadata.get("yob", "") 181 + out["mob"] = raw_metadata.get("mob", "") 182 + out["nationality"] = raw_metadata.get("nationality", "") 183 + out["company_number"] = raw_metadata.get("company_number", "") 184 + 185 + return out 186 +