From a282d62fca78d5793fdb3ba262d118c304b99b34 Mon Sep 17 00:00:00 2001 From: Feng <33390790+fengxie009@users.noreply.github.com> Date: Wed, 7 Dec 2022 10:24:34 +0800 Subject: [PATCH 01/14] Delete GIN.py --- causallearn/search/HiddenCausal/GIN/GIN.py | 357 --------------------- 1 file changed, 357 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/GIN.py diff --git a/causallearn/search/HiddenCausal/GIN/GIN.py b/causallearn/search/HiddenCausal/GIN/GIN.py deleted file mode 100644 index 17f31b20..00000000 --- a/causallearn/search/HiddenCausal/GIN/GIN.py +++ /dev/null @@ -1,357 +0,0 @@ -from collections import deque -from itertools import combinations - -import numpy as np -from scipy.stats import chi2 - -from causallearn.graph.GeneralGraph import GeneralGraph -from causallearn.graph.GraphNode import GraphNode -from causallearn.graph.NodeType import NodeType -from causallearn.graph.Edge import Edge -from causallearn.graph.Endpoint import Endpoint -from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma -from causallearn.utils.KCI.KCI import KCI_UInd - - -def fisher_test(pvals): - pvals = [pval if pval >= 1e-5 else 1e-5 for pval in pvals] - fisher_stat = -2.0 * np.sum(np.log(pvals)) - return 1 - chi2.cdf(fisher_stat, 2 * len(pvals)) - - -def GIN(data, indep_test_method='kci', alpha=0.05): - ''' - Learning causal structure of Latent Variables for Linear Non-Gaussian Latent Variable Model - with Generalized Independent Noise Condition - Parameters - ---------- - data : numpy ndarray - data set - indep_test_method : str, default='kci' - the name of the independence test being used - alpha : float, default=0.05 - desired significance level of independence tests (p_value) in (0,1) - Returns - ------- - G : general graph - causal graph - causal_order : list - causal order - ''' - n = data.shape[1] - cov = np.cov(data.T) - - if indep_test_method == 'kci': - kci = KCI_UInd() - - if indep_test_method not in ['kci', 'hsic']: - raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) - - def indep_test(x, y, method): - if method == 'kci': - return kci.compute_pvalue(x, y)[0] - elif method == 'hsic': - return hsic_test_gamma(x, y)[1] - else: - raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) - - var_set = set(range(n)) - cluster_size = 2 - clusters_list = [] - while cluster_size < len(var_set): - tmp_clusters_list = [] - for cluster in combinations(var_set, cluster_size): - remain_var_set = var_set - set(cluster) - e = cal_e_with_gin(data, cov, list(cluster), list(remain_var_set)) - pvals = [] - for z in range(len(remain_var_set)): - pvals.append(indep_test(data[:, [z]], e[:, None], method=indep_test_method)) - fisher_pval = fisher_test(pvals) - if fisher_pval >= alpha: - tmp_clusters_list.append(cluster) - tmp_clusters_list = merge_overlaping_cluster(tmp_clusters_list) - clusters_list = clusters_list + tmp_clusters_list - for cluster in tmp_clusters_list: - var_set -= set(cluster) - cluster_size += 1 - - causal_order = [] # this variable corresponds to K in paper - updated = True - while updated: - updated = False - X = [] - Z = [] - for cluster_k in causal_order: - cluster_k1, cluster_k2 = array_split(cluster_k, 2) - X += cluster_k1 - Z += cluster_k2 - - for i, cluster_i in enumerate(clusters_list): - is_root = True - cluster_i1, cluster_i2 = array_split(cluster_i, 2) - for j, cluster_j in enumerate(clusters_list): - if i == j: - continue - cluster_j1, cluster_j2 = array_split(cluster_j, 2) - 
e = cal_e_with_gin(data, cov, X + cluster_i1 + cluster_j1, Z + cluster_i2) - pvals = [] - for z in range(len(Z + cluster_i2)): - pvals.append(indep_test(data[:, [z]], e[:, None], method=indep_test_method)) - fisher_pval = fisher_test(pvals) - if fisher_pval < alpha: - is_root = False - break - if is_root: - causal_order.append(cluster_i) - clusters_list.remove(cluster_i) - updated = True - break - - G = GeneralGraph([]) - for var in var_set: - o_node = GraphNode(f"X{var + 1}") - G.add_node(o_node) - - latent_id = 1 - l_nodes = [] - - for cluster in causal_order: - l_node = GraphNode(f"L{latent_id}") - l_node.set_node_type(NodeType.LATENT) - G.add_node(l_node) - for l in l_nodes: - G.add_directed_edge(l, l_node) - l_nodes.append(l_node) - - for o in cluster: - o_node = GraphNode(f"X{o + 1}") - G.add_node(o_node) - G.add_directed_edge(l_node, o_node) - latent_id += 1 - - undirected_l_nodes = [] - - for cluster in clusters_list: - l_node = GraphNode(f"L{latent_id}") - l_node.set_node_type(NodeType.LATENT) - G.add_node(l_node) - for l in l_nodes: - G.add_directed_edge(l, l_node) - - for l in undirected_l_nodes: - G.add_edge(Edge(l, l_node, Endpoint.TAIL, Endpoint.TAIL)) - - undirected_l_nodes.append(l_node) - - for o in cluster: - o_node = GraphNode(f"X{o + 1}") - G.add_node(o_node) - G.add_directed_edge(l_node, o_node) - latent_id += 1 - - return G, causal_order - - -def GIN_MI(data): - ''' - Learning causal structure of Latent Variables for Linear Non-Gaussian Latent Variable Model - with Generalized Independent Noise Condition - - Parameters - ---------- - data : numpy ndarray - data set - - Returns - ------- - G : general graph - causal graph - causal_order : list - causal order - ''' - v_labels = list(range(data.shape[1])) - v_set = set(v_labels) - cov = np.cov(data.T) - - # Step 1: Finding Causal Clusters - cluster_list = [] - min_cluster = {i: set() for i in v_set} - min_dep_score = {i: 1e9 for i in v_set} - for (x1, x2) in combinations(v_set, 2): - x_set = {x1, x2} - z_set = v_set - x_set - dep_statistic = cal_dep_for_gin(data, cov, list(x_set), list(z_set)) - for i in x_set: - if min_dep_score[i] > dep_statistic: - min_dep_score[i] = dep_statistic - min_cluster[i] = x_set - for i in v_labels: - cluster_list.append(list(min_cluster[i])) - - cluster_list = merge_overlaping_cluster(cluster_list) - - # Step 2: Learning the Causal Order of Latent Variables - causal_order = [] # this variable corresponds to K in paper - while (len(cluster_list) != 0): - root = find_root(data, cov, cluster_list, causal_order) - causal_order.append(root) - cluster_list.remove(root) - - latent_id = 1 - l_nodes = [] - G = GeneralGraph([]) - for cluster in causal_order: - l_node = GraphNode(f"L{latent_id}") - l_node.set_node_type(NodeType.LATENT) - l_nodes.append(l_node) - G.add_node(l_node) - for l in l_nodes: - if l != l_node: - G.add_directed_edge(l, l_node) - for o in cluster: - o_node = GraphNode(f"X{o + 1}") - G.add_node(o_node) - G.add_directed_edge(l_node, o_node) - latent_id += 1 - - return G, causal_order - - -def cal_e_with_gin(data, cov, X, Z): - cov_m = cov[np.ix_(Z, X)] - _, _, v = np.linalg.svd(cov_m) - omega = v.T[:, -1] - return np.dot(data[:, X], omega) - - -def cal_dep_for_gin(data, cov, X, Z): - ''' - Calculate the statistics of dependence via Generalized Independent Noise Condition - - Parameters - ---------- - data : data set (numpy ndarray) - cov : covariance matrix - X : test set variables - Z : condition set variables - - Returns - ------- - sta : test statistic - ''' - - e_xz = 
cal_e_with_gin(data, cov, X, Z) - - sta = 0 - for i in Z: - sta += hsic_test_gamma(e_xz, data[:, i])[0] - sta /= len(Z) - return sta - - -def find_root(data, cov, clusters, causal_order): - ''' - Find the causal order by statistics of dependence - Parameters - ---------- - data : data set (numpy ndarray) - cov : covariance matrix - clusters : clusters of observed variables - causal_order : causal order - Returns - ------- - root : latent root cause - ''' - if len(clusters) == 1: - return clusters[0] - root = clusters[0] - dep_statistic_score = 1e30 - for i in clusters: - for j in clusters: - if i == j: - continue - X = [i[0], j[0]] - Z = [] - for k in range(1, len(i)): - Z.append(i[k]) - - if causal_order: - for k in causal_order: - X.append(k[0]) - Z.append(k[1]) - - dep_statistic = cal_dep_for_gin(data, cov, X, Z) - if dep_statistic < dep_statistic_score: - dep_statistic_score = dep_statistic - root = i - - return root - - -def _get_all_elements(S): - result = set() - for i in S: - for j in i: - result |= {j} - return result - - -# merging cluster -def merge_overlaping_cluster(cluster_list): - v_labels = _get_all_elements(cluster_list) - if len(v_labels) == 0: - return [] - cluster_dict = {i: -1 for i in v_labels} - cluster_b = {i: [] for i in v_labels} - cluster_len = 0 - for i in range(len(cluster_list)): - for j in cluster_list[i]: - cluster_b[j].append(i) - - visited = [False] * len(cluster_list) - cont = True - while cont: - cont = False - q = deque() - for i, val in enumerate(visited): - if not val: - q.append(i) - visited[i] = True - break - while q: - top = q.popleft() - for i in cluster_list[top]: - cluster_dict[i] = cluster_len - for j in cluster_b[i]: - if not visited[j]: - q.append(j) - visited[j] = True - - for i in visited: - if not i: - cont = True - break - cluster_len += 1 - - cluster = [[] for _ in range(cluster_len)] - for i in v_labels: - cluster[cluster_dict[i]].append(i) - - return cluster - - -def array_split(x, k): - x_len = len(x) - # div_points = [] - sub_arys = [] - start = 0 - section_len = x_len // k - extra = x_len % k - for i in range(extra): - sub_arys.append(x[start:start + section_len + 1]) - start = start + section_len + 1 - - for i in range(k - extra): - sub_arys.append(x[start:start + section_len]) - start = start + section_len - return sub_arys \ No newline at end of file From 89b80f1c8952cfcba73300544c0ff57a82e405e9 Mon Sep 17 00:00:00 2001 From: Feng <33390790+fengxie009@users.noreply.github.com> Date: Wed, 7 Dec 2022 10:25:08 +0800 Subject: [PATCH 02/14] Add files via upload --- .../search/HiddenCausal/GIN/FisherTest.py | 45 +++ causallearn/search/HiddenCausal/GIN/GIN.py | 68 +++++ causallearn/search/HiddenCausal/GIN/GIN2.py | 94 ++++++ causallearn/search/HiddenCausal/GIN/HSIC2.py | 101 +++++++ .../GIN/Identifying_Causal_Clusters.py | 70 +++++ .../HiddenCausal/GIN/Infer_Causal_Order.py | 70 +++++ .../search/HiddenCausal/GIN/Utils_V2.py | 127 ++++++++ .../search/HiddenCausal/GIN/independence.py | 38 +++ causallearn/search/HiddenCausal/GIN/mutual.py | 276 ++++++++++++++++++ 9 files changed, 889 insertions(+) create mode 100644 causallearn/search/HiddenCausal/GIN/FisherTest.py create mode 100644 causallearn/search/HiddenCausal/GIN/GIN.py create mode 100644 causallearn/search/HiddenCausal/GIN/GIN2.py create mode 100644 causallearn/search/HiddenCausal/GIN/HSIC2.py create mode 100644 causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py create mode 100644 causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py create mode 100644 
causallearn/search/HiddenCausal/GIN/Utils_V2.py create mode 100644 causallearn/search/HiddenCausal/GIN/independence.py create mode 100644 causallearn/search/HiddenCausal/GIN/mutual.py diff --git a/causallearn/search/HiddenCausal/GIN/FisherTest.py b/causallearn/search/HiddenCausal/GIN/FisherTest.py new file mode 100644 index 00000000..0e1de1b8 --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/FisherTest.py @@ -0,0 +1,45 @@ +#------------------------------------------------------------------------------- +# Name: 模块1 +# Purpose: +# +# Author: YY +# +# Created: 03/03/2021 +# Copyright: (c) YY 2021 +# Licence: +#------------------------------------------------------------------------------- +import math +from scipy.stats import chi2 + +def FisherTest(pvals,alph=0.01): + Fisher_Stat=0 + L = len(pvals) + for i in range(0,L): + if pvals[i] ==0: + TP = 1e-05 + else: + TP = pvals[i] + + Fisher_Stat = Fisher_Stat-2*math.log(TP) + + + Fisher_pval = 1-chi2.cdf(Fisher_Stat, 2*L) #自由度2*L + + #print(Fisher_pval) + + if Fisher_pval >alph: + return True,Fisher_pval + else: + return False,Fisher_pval + + + + + + +def main(): + pvals = [0.01,0.9] + FisherTest(pvals,0.1) + +if __name__ == '__main__': + main() diff --git a/causallearn/search/HiddenCausal/GIN/GIN.py b/causallearn/search/HiddenCausal/GIN/GIN.py new file mode 100644 index 00000000..5e0963e2 --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/GIN.py @@ -0,0 +1,68 @@ +from causallearn.graph.GeneralGraph import GeneralGraph +from causallearn.graph.GraphNode import GraphNode +from causallearn.graph.NodeType import NodeType +from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma +from causallearn.utils.KCI.KCI import KCI_UInd + +import causallearn.search.HiddenCausal.GIN.Infer_Causal_Order as ICO +import causallearn.search.HiddenCausal.GIN.Identifying_Causal_Clusters as ICC + + + +def GIN(data, indep_test_method='kerpy', alpha=0.05): + + if indep_test_method not in ['kci', 'hsic', 'kerpy']: + raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) + + if indep_test_method == 'kci': + kci = KCI_UInd() + + if indep_test_method == 'kerpy': + from kerpy.GaussianKernel import GaussianKernel + from independence_testing.HSICSpectralTestObject import HSICSpectralTestObject + kernelX = GaussianKernel(float(0.1)) + kernelY = GaussianKernel(float(0.1)) + + num_samples = data.shape[0] + + myspectralobject = HSICSpectralTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, + kernelX_use_median=False, kernelY_use_median=False, + rff=True, num_rfx=20, num_rfy=20, num_nullsims=500) + + + if indep_test_method == 'kci': + def indep_test(x, y): + return kci.compute_pvalue(x, y)[0] + elif indep_test_method == 'hsic': + def indep_test(x, y): + return hsic_test_gamma(x, y)[1] + elif indep_test_method == 'kerpy': + def indep_test(x, y): + return myspectralobject.compute_pvalue(x, y) + else: + raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) + + + #identifying causal cluster by using fast HSIC, set Signification Level(alhpa) =0.05 + Cluster=ICC.FindCluser(data,indep_test,alpha) + #identifying causal order by using mutual information, (k nearest neighbors (method='1') or sklearn package (method='2') + CausalOrder=ICO.LearnCausalOrder(Cluster,data,method='1') + + + latent_id = 1 + l_nodes = [] + G = GeneralGraph([]) + for cluster in CausalOrder: + l_node = GraphNode(f"L{latent_id}") + l_node.set_node_type(NodeType.LATENT) + l_nodes.append(l_node) + 
G.add_node(l_node) + for l in l_nodes: + if l != l_node: + G.add_directed_edge(l, l_node) + for o in cluster: + o_node = GraphNode(f"X{o + 1}") + G.add_node(o_node) + G.add_directed_edge(l_node, o_node) + latent_id += 1 + return G, CausalOrder diff --git a/causallearn/search/HiddenCausal/GIN/GIN2.py b/causallearn/search/HiddenCausal/GIN/GIN2.py new file mode 100644 index 00000000..97aac2ac --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/GIN2.py @@ -0,0 +1,94 @@ +import numpy as np +import pandas as pd +import causallearn.search.HiddenCausal.GIN.FisherTest as FisherTest +import causallearn.search.HiddenCausal.GIN.independence as ID + +#GIN by +#X=['X1','X2'] +#Z=['X3'] +#data.type=Pandas.DataFrame +def GIN(X, Z, data, test_function, alpha=0.05): + omega = getomega(data,X,Z) + tdata= data[:, X] + #print(tdata.T) + result = np.dot(tdata, omega)[:,None] + for i in Z: + temp = np.array(data[:, [i]]) + pval =test_function(result, temp) + if pval > alpha: + flag = True + else: + flag = False + + if not flag:#not false == ture ---> if false + #print(X,Z,flag) + return False + + return True + + + +#GIN by Fisher's method +def FisherGIN(X,Z,data,test_function,alpha=0.01): + omega = getomega(data,X,Z) + tdata= data[:,X] + result = np.dot(tdata, omega)[:,None] + pvals=[] + + for i in Z: + temp = np.array(data[:, [i]]) + pval=test_function(result, temp) + pvals.append(pval) + flag,fisher_pval=FisherTest.FisherTest(pvals,alpha) + + return flag + + + + + + +#mthod 1: estimating mutual information by k nearest neighbors (density estimation) +#mthod 2: estimating mutual information by sklearn package +def GIN_MI(X,Z,data,method='1'): + omega = getomega(data,X,Z) + tdata= data[:, X] + result = np.dot(tdata, omega) + MIS=0 + for i in Z: + + temp = np.array(data[:, i]) + if method =='1': + mi=ID.independent(result,temp) + else: + mi=ID.independent11(result,temp) + MIS+=mi + MIS = MIS/len(Z) + + return MIS + + + + + +def getomega(data,X,Z): + cov_m =np.cov(data,rowvar=False) + # print(f'{cov_m.shape = }') + col = list(range(data.shape[1])) + Xlist = [] + Zlist = [] + for i in X: + t = col.index(i) + Xlist.append(t) + for i in Z: + t = col.index(i) + Zlist.append(t) + B = cov_m[Xlist] + B = B[:,Zlist] + A = B.T + u,s,v = np.linalg.svd(A) + lens = len(X) + omega =v.T[:,lens-1] + + return omega + diff --git a/causallearn/search/HiddenCausal/GIN/HSIC2.py b/causallearn/search/HiddenCausal/GIN/HSIC2.py new file mode 100644 index 00000000..8f04c3ca --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/HSIC2.py @@ -0,0 +1,101 @@ +from kerpy.GaussianKernel import GaussianKernel +from numpy import shape,savetxt,loadtxt,transpose,shape,reshape,concatenate +from independence_testing.HSICSpectralTestObject import HSICSpectralTestObject +from independence_testing.HSICBlockTestObject import HSICBlockTestObject +import numpy as np +import pandas as pd + +#Fast HSIC, refer to ./kerpy-master/independence_testing/ExampleHSIC.py + +def test(x,y,alph=0.05): + lens = len(x) + x=x.reshape(lens,1) + y=y.reshape(lens,1) +## +## kernelX=GaussianKernel() +## kernelY=GaussianKernel() + + kernelY = GaussianKernel(float(0.1)) + kernelX=GaussianKernel(float(0.1)) + + + num_samples = lens + + myspectralobject = HSICSpectralTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, + kernelX_use_median=False, kernelY_use_median=False, + rff=True, num_rfx=20, num_rfy=20, num_nullsims=500) + + pvalue = myspectralobject.compute_pvalue(x, y) + + #print(pvalue) + if pvalue >alph: + return True + else: + return False + #return pvalue + 
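# Illustrative sketch only, not part of the committed file: the spectral
# test above accepts independence when the p-value exceeds alph. Assuming
# kerpy is installed and test() is in scope, a smoke check might read:
def _spectral_hsic_smoke_check():
    x = np.random.uniform(size=2000)
    y = 0.5 * x + 0.1 * np.random.uniform(size=2000)   # y depends on x
    z = np.random.uniform(size=2000)                   # z independent of x
    print(test(x, y))   # expected False: small p-value, dependence detected
    print(test(x, z))   # expected True: p-value above alph, independence kept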
+def test2(x,y,alph=0.01): + lens = len(x) + x=x.reshape(lens,1) + y=y.reshape(lens,1) + kernelX=GaussianKernel() + kernelY=GaussianKernel() + + num_samples = lens + + myblockobject = HSICBlockTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, + kernelX_use_median=True, kernelY_use_median=True, + blocksize=80, nullvarmethod='permutation') + + pvalue = myblockobject.compute_pvalue(x, y) + #print(pvalue) + if pvalue >alph: + return True + else: + return False + + + +def INtest(x,y,alph=0.01): + lens = len(x) + x=x.reshape(lens,1) + y=y.reshape(lens,1) + kernelX=GaussianKernel() + kernelY=GaussianKernel() +## kernelY = GaussianKernel(float(0.3)) +## kernelX=GaussianKernel(float(0.3)) + num_samples = lens + + myspectralobject = HSICSpectralTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, + kernelX_use_median=True, kernelY_use_median=True, + rff=True, num_rfx=20, num_rfy=20, num_nullsims=1000) + + pvalue = myspectralobject.compute_pvalue(x, y) + + return pvalue + +def INtest2(x,y,alph=0.01): + lens = len(x) + x=x.reshape(lens,1) + y=y.reshape(lens,1) + kernelX=GaussianKernel() + kernelY=GaussianKernel() + num_samples = lens + + myblockobject = HSICBlockTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, + kernelX_use_median=True, kernelY_use_median=True, + blocksize=200, nullvarmethod='permutation') + + pvalue = myblockobject.compute_pvalue(x, y) + + return pvalue + + + +def main(): + x=np.random.uniform(size=10000) + y=np.random.uniform(size=10000)+x + test2(x,y) + +if __name__ == '__main__': + main() diff --git a/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py new file mode 100644 index 00000000..6052b7cd --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py @@ -0,0 +1,70 @@ +import causallearn.search.HiddenCausal.GIN.GIN2 as GIN2 #based on hsic to find cluster +import causallearn.search.HiddenCausal.GIN.Utils_V2 as Utils #overlap merge utils +import itertools + +#return the combination of list +def FindCombination(Lists,N): + return itertools.combinations(Lists,N) + + +#GIN test by fast HSIC +#X and Z are list, e.g., X=['X1','X2'] Z=['X3'] +#Data.type=Pandas.DataFrame, where data.columns=['x1','x2',...] 
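#The GIN condition behind this wrapper: omega is chosen via the SVD in
#GIN2.getomega so that E = omega'X is uncorrelated with Z by construction;
#a cluster is accepted only when E additionally passes the supplied
#test_function for statistical independence from Z.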
+def GIN(X,Z,data,test_function,alpha=0.05): + return GIN2.GIN(X,Z,data,test_function,alpha) + #HSIC with fisher method + #return GIN2.FisherGIN(X,Z,data,alpha) + + +#identifying causal cluster from 1-factor to n-factor +#limit=N is to limited the n-factor we found +def FindCluser(data, test_function, alhpa=0.05,limit=0): + indexs=list(range(data.shape[1])) + B = indexs.copy() + Cluster={} + Grlen=2 + + #finding causal cluster, 1-factor to n-factor, recorded by dic type, e.g., {'1':[['x1','x2']],'2':[['x4','x5','x6']} + while len(B) >= Grlen and len(indexs) >=2*Grlen-1: + LatentNum=Grlen-1 + Set_P=FindCombination(B,Grlen) + print('identifying causal cluster: '+str(LatentNum)+'-factor model:') + for P in Set_P: + tind=indexs.copy() + for t in P: + tind.remove(t) # tind= ALLdata\P + if GIN(list(P),tind,data,test_function,alhpa): + key=Cluster.keys() + key = list(key) + if (LatentNum) in key: + temp =Cluster[LatentNum] + temp.append(list(P)) + Cluster[LatentNum]=temp + else: + Cluster[LatentNum]=[list(P)] + key=Cluster.keys() + if LatentNum in key: + Tclu=Cluster[LatentNum] + Tclu=Utils.merge_list(Tclu) + Cluster[LatentNum]=Tclu + #update the B + for i in Tclu: + for j in i: + if j in B: + B.remove(j) + + + Grlen+=1 + print('The identified cluster in identifying '+ str(LatentNum)+ '-factor model:') + print(Cluster) + + #limit the n-factor we found + if limit !=0 and (Grlen-1)>limit: + break + + + + return Cluster + + + diff --git a/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py new file mode 100644 index 00000000..514489c1 --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py @@ -0,0 +1,70 @@ +import causallearn.search.HiddenCausal.GIN.GIN2 as GIN #GIN based on mutual information + + +# identifying causal order by identifying root node recursively. 
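# at each step FindRoot scores every remaining cluster by the averaged mutual
# information between its GIN surrogate and the conditioning set, takes the
# cluster with the smallest score as the next root, appends it to K and
# removes it from the pool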
+#cluster.type=dic, which is learned by phase I {identifying causal cluster} +#method 1 = estimating mutual information by k nearest neighbors +def LearnCausalOrder(cluster,data,method='1'): + Cluster=GetCluster(cluster) + #print(Cluster) + + K=[] #causal order + while(len(Cluster) >0): + root = FindRoot(Cluster,data,K,method='1') + K.append(root) + Cluster.remove(root) + return K + +#finding root node by mutual information based GIN +def FindRoot(clu1,data1,K1,method='1'): + cluster=clu1.copy() + data =data1.copy() + K=K1.copy() + if len(cluster) ==1: + return cluster[0] + + MIS=[] + for clu in cluster: + other=cluster.copy() + other.remove(clu) + TempM=0 + for o in other: + X=[] + Z=[] + + for i in range(0,len(clu)): + if i < len(clu)/2: + X.append(clu[i]) + else: + Z.append(clu[i]) + for i in range(0,len(o)): + if i < len(o)/2: + X.append(o[i]) + + + if len(K) != 0: + for i in K: + for j in range(0,len(i)): + if j < int(len(i)/2): + X.append(i[j]) + else: + Z.append(i[j]) + + #print(X,Z) + mi = GIN.GIN_MI(X,Z,data,method='1') + TempM+=mi + TempM =TempM + MIS.append(TempM) + + mins=MIS.index(min(MIS)) + root = cluster[mins] + return root + +def GetCluster(cluster): + Clu=[] + key=cluster.keys() + for i in key: + C=cluster[i] + for j in C: + Clu.append(j) + return Clu diff --git a/causallearn/search/HiddenCausal/GIN/Utils_V2.py b/causallearn/search/HiddenCausal/GIN/Utils_V2.py new file mode 100644 index 00000000..04513229 --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/Utils_V2.py @@ -0,0 +1,127 @@ +import numpy as np +import pandas as pd +from itertools import combinations +import itertools +import networkx as nx + + + + + + + +def bfs(graph, start): + visited, queue = set(), [start] + while queue: + vertex = queue.pop(0) + if vertex not in visited: + visited.add(vertex) + queue.extend(graph[vertex] - visited) + return visited + +def connected_components(G): + seen = set() + for v in G: + if v not in seen: + c = set(bfs(G, v)) + yield c + seen.update(c) + +def graph(edge_list): + result = {} + for source, target in edge_list: + result.setdefault(source, set()).add(target) + result.setdefault(target, set()).add(source) + return result + + +def merge_list(L2): + l=L2.copy() + + edges = [] + s = list(map(set, l)) + for i, j in combinations(range(len(s)), r=2): + + if s[i].intersection(s[j]): + edges.append((i, j)) + + G = graph(edges) + + result = [] + unassigned = list(range(len(s))) + + for component in connected_components(G): + + union = set().union(*(s[i] for i in component)) + + result.append(sorted(union)) + + unassigned = [i for i in unassigned if i not in component] + + + result.extend(map(sorted, (s[i] for i in unassigned))) + + return result + + +#merge withour change any order +def merge_listNew(L2): + + + + l=L2.copy() + + + edges = [] + s = list(map(set, l)) + for i, j in combinations(range(len(s)), r=2): + if s[i].intersection(s[j]): + edges.append((i, j)) + + G = graph(edges) + + result = [] + unassigned = list(range(len(s))) + + for component in connected_components(G): + + component=list(component) + + Cluster=l[component[0]] + + adj= s[0].intersection(s[1]) + for i in component: + if i == 0: + continue + temp=l[i] + for j in list(adj): + if j in temp: + temp.remove(j) + + for j in range(0,len(temp)): + if j < (len(temp)/2): + Cluster.insert(0,temp[j]) + else: + Cluster.append(temp[j]) + + result.append(Cluster) + + + + unassigned = [i for i in unassigned if i not in component] + + + result.extend(map(sorted, (s[i] for i in unassigned))) + + return result + + + + 
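# Illustrative sketch only, not part of the committed file: merge_list
# collapses clusters that share any element into one connected component.
def _merge_list_demo():
    clusters = [['x1', 'x2'], ['x3', 'x4'], ['x5', 'x6', 'x2'], ['x1', 'x9']]
    # 'x2' joins the first and third lists, 'x1' joins the first and fourth,
    # so three of the four lists collapse while ['x3', 'x4'] stays separate.
    print(merge_list(clusters))
    # expected, up to component order: [['x1', 'x2', 'x5', 'x6', 'x9'], ['x3', 'x4']]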
+def main(): + L=[['x1','x2'],['x3','x4'],['x5','x6','x2'],['x1','x9']] + print(merge_list(L)) + + +if __name__ == '__main__': + main() diff --git a/causallearn/search/HiddenCausal/GIN/independence.py b/causallearn/search/HiddenCausal/GIN/independence.py new file mode 100644 index 00000000..dd64ef6b --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/independence.py @@ -0,0 +1,38 @@ +import numpy as np +import pandas as pd +import causallearn.search.HiddenCausal.GIN.mutual as MI +from sklearn import metrics + +#estimating mutual information by sklearn +def independent11(x1,y1): + x=x1.copy() + y=y1.copy() + length=len(x) +## x=x.reshape(length,1) +## y=y.reshape(length,1) + x = list(x) + y = list(y) + + result_NMI=metrics.normalized_mutual_info_score(x, y) + print (result_NMI) + return result_NMI + +#estimating mutual information by Non-parametric computation of entropy and mutual-information +def independent(x1,y1): + x=x1.copy() + y=y1.copy() + length=len(x) + x=x.reshape(length,1) + y=y.reshape(length,1) + + if length >3000: + k=15 + else: + k=10 + + mi= MI.mutual_information((x,y),k) + return abs(mi) + + + + diff --git a/causallearn/search/HiddenCausal/GIN/mutual.py b/causallearn/search/HiddenCausal/GIN/mutual.py new file mode 100644 index 00000000..35c4cb8b --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/mutual.py @@ -0,0 +1,276 @@ + + +''' +Non-parametric computation of entropy and mutual-information +Adapted by G Varoquaux for code created by R Brette, itself +from several papers (see in the code). +These computations rely on nearest-neighbor statistics +''' +import numpy as np + +from scipy.special import gamma,psi +from scipy import ndimage +from scipy.linalg import det +from numpy import pi + +from sklearn.neighbors import NearestNeighbors + +__all__=['entropy', 'mutual_information', 'entropy_gaussian'] + +EPS = np.finfo(float).eps + + +def nearest_distances(X, k=1): + ''' + X = array(N,M) + N = number of points + M = number of dimensions + returns the distance to the kth nearest neighbor for every point in X + ''' + knn = NearestNeighbors(n_neighbors=k + 1) + knn.fit(X) + d, _ = knn.kneighbors(X) # the first nearest neighbor is itself + return d[:, -1] # returns the distance to the kth nearest neighbor + + +def entropy_gaussian(C): + ''' + Entropy of a gaussian variable with covariance matrix C + ''' + if np.isscalar(C): # C is the variance + return .5*(1 + np.log(2*pi)) + .5*np.log(C) + else: + n = C.shape[0] # dimension + return .5*n*(1 + np.log(2*pi)) + .5*np.log(abs(det(C))) + + +def entropy2(u,k=1): + k1 = 79.047 + k2 = 7.4129 + gamma = 0.37457 + return (1 + np.log(2 * np.pi)) / 2 - k1 * (np.mean(np.log(np.cosh(u))) - gamma)**2 - k2 * (np.mean(u * np.exp((-u**2) / 2)))**2 + + +def entropy(X, k=1): + ''' Returns the entropy of the X. + Parameters + =========== + X : array-like, shape (n_samples, n_features) + The data the entropy of which is computed + k : int, optional + number of nearest neighbors for density estimation + Notes + ====== + Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy + of a random vector. Probl. Inf. Transm. 23, 95-101. + See also: Evans, D. 2008 A computationally efficient estimator for + mutual information, Proc. R. Soc. A 464 (2093), 1203-1215. + and: + Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual + information. Phys Rev E 69(6 Pt 2):066138. 
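    The estimator implemented below is
    H = d * mean_i log r_i(k) + log V_d + psi(n) - psi(k),
    where r_i(k) is the distance from sample i to its k-th nearest
    neighbour and V_d is the volume of the d-dimensional unit ball.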
+ ''' + + # Distance to kth nearest neighbor + r = nearest_distances(X, k) # squared distances + n, d = X.shape + volume_unit_ball = (pi**(.5*d)) / gamma(.5*d + 1) + ''' + F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures + for Continuous Random Variables. Advances in Neural Information + Processing Systems 21 (NIPS). Vancouver (Canada), December. + return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k) + ''' + return (d*np.mean(np.log(r + np.finfo(X.dtype).eps)) + + np.log(volume_unit_ball) + psi(n) - psi(k)) + + +def mutual_information(variables, k=1): + ''' + Returns the mutual information between any number of variables. + Each variable is a matrix X = array(n_samples, n_features) + where + n = number of samples + dx,dy = number of dimensions + Optionally, the following keyword argument can be specified: + k = number of nearest neighbors for density estimation + Example: mutual_information((X, Y)), mutual_information((X, Y, Z), k=5) + ''' + if len(variables) < 2: + raise AttributeError( + "Mutual information must involve at least 2 variables") + all_vars = np.hstack(variables) + return (sum([entropy(X, k=k) for X in variables]) + - entropy(all_vars, k=k)) + + +def mutual_information_2d(x, y, sigma=1, normalized=False): + """ + Computes (normalized) mutual information between two 1D variate from a + joint histogram. + Parameters + ---------- + x : 1D array + first variable + y : 1D array + second variable + sigma: float + sigma for Gaussian smoothing of the joint histogram + Returns + ------- + nmi: float + the computed similariy measure + """ + bins = (256, 256) + + jh = np.histogram2d(x, y, bins=bins)[0] + + # smooth the jh with a gaussian filter of given sigma + ndimage.gaussian_filter(jh, sigma=sigma, mode='constant', + output=jh) + + # compute marginal histograms + jh = jh + EPS + sh = np.sum(jh) + jh = jh / sh + s1 = np.sum(jh, axis=0).reshape((-1, jh.shape[0])) + s2 = np.sum(jh, axis=1).reshape((jh.shape[1], -1)) + + # Normalised Mutual Information of: + # Studholme, jhill & jhawkes (1998). + # "A normalized entropy measure of 3-D medical image alignment". + # in Proc. Medical Imaging 1998, vol. 3338, San Diego, CA, pp. 132-143. 
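    # With normalized=True this returns the Studholme overlap measure
    # (H(x) + H(y)) / H(x, y) - 1; otherwise it returns plain mutual
    # information in nats, MI = H(x) + H(y) - H(x, y), computed from the
    # smoothed joint histogram jh and its marginals s1, s2.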
+ if normalized: + mi = ((np.sum(s1 * np.log(s1)) + np.sum(s2 * np.log(s2))) + / np.sum(jh * np.log(jh))) - 1 + else: + mi = ( np.sum(jh * np.log(jh)) - np.sum(s1 * np.log(s1)) + - np.sum(s2 * np.log(s2))) + + return mi + + + +############################################################################### +# Tests + +def test_entropy(): + # Testing against correlated Gaussian variables + # (analytical results are known) + # Entropy of a 3-dimensional gaussian variable + rng = np.random.RandomState(0) + n = 50000 + d = 3 + P = np.array([[1, 0, 0], [0, 1, .5], [0, 0, 1]]) + C = np.dot(P, P.T) + Y = rng.randn(d, n) + X = np.dot(P, Y) + H_th = entropy_gaussian(C) + H_est = entropy(X.T, k=5) + # Our estimated entropy should always be less that the actual one + # (entropy estimation undershoots) but not too much + np.testing.assert_array_less(H_est, H_th) + np.testing.assert_array_less(.9*H_th, H_est) + + +def test_mutual_information(): + # Mutual information between two correlated gaussian variables + # Entropy of a 2-dimensional gaussian variable + n = 50000 + rng = np.random.RandomState(0) + #P = np.random.randn(2, 2) + P = np.array([[1, 0], [0.5, 1]]) + C = np.dot(P, P.T) + U = rng.randn(2, n) + Z = np.dot(P, U).T + X = Z[:, 0] + X = X.reshape(len(X), 1) + Y = Z[:, 1] + Y = Y.reshape(len(Y), 1) + # in bits + MI_est = mutual_information((X, Y), k=5) + MI_th = (entropy_gaussian(C[0, 0]) + + entropy_gaussian(C[1, 1]) + - entropy_gaussian(C) + ) + # Our estimator should undershoot once again: it will undershoot more + # for the 2D estimation that for the 1D estimation + print((MI_est, MI_th)) + np.testing.assert_array_less(MI_est, MI_th) + np.testing.assert_array_less(MI_th, MI_est + .3) + + +def test_degenerate(): + # Test that our estimators are well-behaved with regards to + # degenerate solutions + rng = np.random.RandomState(0) + x = rng.randn(50000) + X = np.c_[x, x] + assert np.isfinite(entropy(X)) + assert np.isfinite(mutual_information((x[:, np.newaxis], + x[:, np.newaxis]))) + assert 2.9 < mutual_information_2d(x, x) < 3.1 + + +def test_mutual_information_2d(): + # Mutual information between two correlated gaussian variables + # Entropy of a 2-dimensional gaussian variable + n = 50000 + rng = np.random.RandomState(0) + #P = np.random.randn(2, 2) + P = np.array([[1, 0], [.9, .1]]) + C = np.dot(P, P.T) + U = rng.randn(2, n) + Z = np.dot(P, U).T + X = Z[:, 0] + X = X.reshape(len(X), 1) + Y = Z[:, 1] + Y = Y.reshape(len(Y), 1) + # in bits + MI_est = mutual_information_2d(X.ravel(), Y.ravel()) + MI_th = (entropy_gaussian(C[0, 0]) + + entropy_gaussian(C[1, 1]) + - entropy_gaussian(C) + ) + print((MI_est, MI_th)) + # Our estimator should undershoot once again: it will undershoot more + # for the 2D estimation that for the 1D estimation + np.testing.assert_array_less(MI_est, MI_th) + np.testing.assert_array_less(MI_th, MI_est + .2) + +def _entropy(u): + """Calculate entropy using the maximum entropy approximations.""" + k1 = 79.047 + k2 = 7.4129 + gamma = 0.37457 + return (1 + np.log(2 * np.pi)) / 2 - k1 * (np.mean(np.log(np.cosh(u))) - gamma)**2 - k2 * (np.mean(u * np.exp((-u**2) / 2)))**2 + +def main(): + x=np.random.uniform(size=1000) + y=np.random.uniform(size=1000)*0.2+0.5*x + + mi=mutual_information_2d(x,y,normalized=True) + + x=x.reshape(1000,1) + y=y.reshape(1000,1) + + mi2=mutual_information((x,y),k=10) + print(mi) + print(mi2) +## +## x=x.reshape(10000,1) +## y=y.reshape(10000,1) +## a =entropy(x) +## b=_entropy(x) +## +## print(a) +## print(b) + + + +if __name__ == '__main__': + # 
Run our tests +## test_entropy() +## test_mutual_information() +## test_degenerate() +## test_mutual_information_2d() + main() \ No newline at end of file From fcbafd3b75c95bfa97e5028b1f94dc573d0d2383 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:13:29 +0800 Subject: [PATCH 03/14] Delete FisherTest.py --- .../search/HiddenCausal/GIN/FisherTest.py | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/FisherTest.py diff --git a/causallearn/search/HiddenCausal/GIN/FisherTest.py b/causallearn/search/HiddenCausal/GIN/FisherTest.py deleted file mode 100644 index 0e1de1b8..00000000 --- a/causallearn/search/HiddenCausal/GIN/FisherTest.py +++ /dev/null @@ -1,45 +0,0 @@ -#------------------------------------------------------------------------------- -# Name: 模块1 -# Purpose: -# -# Author: YY -# -# Created: 03/03/2021 -# Copyright: (c) YY 2021 -# Licence: -#------------------------------------------------------------------------------- -import math -from scipy.stats import chi2 - -def FisherTest(pvals,alph=0.01): - Fisher_Stat=0 - L = len(pvals) - for i in range(0,L): - if pvals[i] ==0: - TP = 1e-05 - else: - TP = pvals[i] - - Fisher_Stat = Fisher_Stat-2*math.log(TP) - - - Fisher_pval = 1-chi2.cdf(Fisher_Stat, 2*L) #自由度2*L - - #print(Fisher_pval) - - if Fisher_pval >alph: - return True,Fisher_pval - else: - return False,Fisher_pval - - - - - - -def main(): - pvals = [0.01,0.9] - FisherTest(pvals,0.1) - -if __name__ == '__main__': - main() From 200d8339c6287d3034185e41c016c354686fecf9 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:13:56 +0800 Subject: [PATCH 04/14] Add files via upload --- .../search/HiddenCausal/GIN/FisherTest.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 causallearn/search/HiddenCausal/GIN/FisherTest.py diff --git a/causallearn/search/HiddenCausal/GIN/FisherTest.py b/causallearn/search/HiddenCausal/GIN/FisherTest.py new file mode 100644 index 00000000..46b9893b --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/FisherTest.py @@ -0,0 +1,21 @@ +import math +from scipy.stats import chi2 + +def FisherTest(pvals, alpha = 0.01): + Fisher_Stat = 0 + L = len(pvals) + for i in range(0,L): + if pvals[i] == 0: + TP = 1e-05 + else: + TP = pvals[i] + + Fisher_Stat = Fisher_Stat - 2*math.log(TP) + + Fisher_pval = 1 - chi2.cdf(Fisher_Stat, 2*L) + + if Fisher_pval >alpha: + return True, Fisher_pval + else: + return False, Fisher_pval + From 0db446d381bd7c1ee4771da58722ae82d8cab279 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:14:15 +0800 Subject: [PATCH 05/14] Delete GIN.py --- causallearn/search/HiddenCausal/GIN/GIN.py | 68 ---------------------- 1 file changed, 68 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/GIN.py diff --git a/causallearn/search/HiddenCausal/GIN/GIN.py b/causallearn/search/HiddenCausal/GIN/GIN.py deleted file mode 100644 index 5e0963e2..00000000 --- a/causallearn/search/HiddenCausal/GIN/GIN.py +++ /dev/null @@ -1,68 +0,0 @@ -from causallearn.graph.GeneralGraph import GeneralGraph -from causallearn.graph.GraphNode import GraphNode -from causallearn.graph.NodeType import NodeType -from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma -from causallearn.utils.KCI.KCI import KCI_UInd - -import causallearn.search.HiddenCausal.GIN.Infer_Causal_Order as ICO 
-import causallearn.search.HiddenCausal.GIN.Identifying_Causal_Clusters as ICC - - - -def GIN(data, indep_test_method='kerpy', alpha=0.05): - - if indep_test_method not in ['kci', 'hsic', 'kerpy']: - raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) - - if indep_test_method == 'kci': - kci = KCI_UInd() - - if indep_test_method == 'kerpy': - from kerpy.GaussianKernel import GaussianKernel - from independence_testing.HSICSpectralTestObject import HSICSpectralTestObject - kernelX = GaussianKernel(float(0.1)) - kernelY = GaussianKernel(float(0.1)) - - num_samples = data.shape[0] - - myspectralobject = HSICSpectralTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, - kernelX_use_median=False, kernelY_use_median=False, - rff=True, num_rfx=20, num_rfy=20, num_nullsims=500) - - - if indep_test_method == 'kci': - def indep_test(x, y): - return kci.compute_pvalue(x, y)[0] - elif indep_test_method == 'hsic': - def indep_test(x, y): - return hsic_test_gamma(x, y)[1] - elif indep_test_method == 'kerpy': - def indep_test(x, y): - return myspectralobject.compute_pvalue(x, y) - else: - raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) - - - #identifying causal cluster by using fast HSIC, set Signification Level(alhpa) =0.05 - Cluster=ICC.FindCluser(data,indep_test,alpha) - #identifying causal order by using mutual information, (k nearest neighbors (method='1') or sklearn package (method='2') - CausalOrder=ICO.LearnCausalOrder(Cluster,data,method='1') - - - latent_id = 1 - l_nodes = [] - G = GeneralGraph([]) - for cluster in CausalOrder: - l_node = GraphNode(f"L{latent_id}") - l_node.set_node_type(NodeType.LATENT) - l_nodes.append(l_node) - G.add_node(l_node) - for l in l_nodes: - if l != l_node: - G.add_directed_edge(l, l_node) - for o in cluster: - o_node = GraphNode(f"X{o + 1}") - G.add_node(o_node) - G.add_directed_edge(l_node, o_node) - latent_id += 1 - return G, CausalOrder From 9ad04fafb1d831fcf1b8e4bcb65e73a1c43ea73f Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:14:23 +0800 Subject: [PATCH 06/14] Delete GIN2.py --- causallearn/search/HiddenCausal/GIN/GIN2.py | 94 --------------------- 1 file changed, 94 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/GIN2.py diff --git a/causallearn/search/HiddenCausal/GIN/GIN2.py b/causallearn/search/HiddenCausal/GIN/GIN2.py deleted file mode 100644 index 97aac2ac..00000000 --- a/causallearn/search/HiddenCausal/GIN/GIN2.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -import pandas as pd -import causallearn.search.HiddenCausal.GIN.FisherTest as FisherTest -import causallearn.search.HiddenCausal.GIN.independence as ID - -#GIN by -#X=['X1','X2'] -#Z=['X3'] -#data.type=Pandas.DataFrame -def GIN(X, Z, data, test_function, alpha=0.05): - omega = getomega(data,X,Z) - tdata= data[:, X] - #print(tdata.T) - result = np.dot(tdata, omega)[:,None] - for i in Z: - temp = np.array(data[:, [i]]) - pval =test_function(result, temp) - if pval > alpha: - flag = True - else: - flag = False - - if not flag:#not false == ture ---> if false - #print(X,Z,flag) - return False - - return True - - - -#GIN by Fisher's method -def FisherGIN(X,Z,data,test_function,alpha=0.01): - omega = getomega(data,X,Z) - tdata= data[:,X] - result = np.dot(tdata, omega)[:,None] - pvals=[] - - for i in Z: - temp = np.array(data[:, [i]]) - pval=test_function(result, temp) - pvals.append(pval) - 
flag,fisher_pval=FisherTest.FisherTest(pvals,alpha) - - return flag - - - - - - -#mthod 1: estimating mutual information by k nearest neighbors (density estimation) -#mthod 2: estimating mutual information by sklearn package -def GIN_MI(X,Z,data,method='1'): - omega = getomega(data,X,Z) - tdata= data[:, X] - result = np.dot(tdata, omega) - MIS=0 - for i in Z: - - temp = np.array(data[:, i]) - if method =='1': - mi=ID.independent(result,temp) - else: - mi=ID.independent11(result,temp) - MIS+=mi - MIS = MIS/len(Z) - - return MIS - - - - - -def getomega(data,X,Z): - cov_m =np.cov(data,rowvar=False) - # print(f'{cov_m.shape = }') - col = list(range(data.shape[1])) - Xlist = [] - Zlist = [] - for i in X: - t = col.index(i) - Xlist.append(t) - for i in Z: - t = col.index(i) - Zlist.append(t) - B = cov_m[Xlist] - B = B[:,Zlist] - A = B.T - u,s,v = np.linalg.svd(A) - lens = len(X) - omega =v.T[:,lens-1] - - return omega - From 1b2d4c7b2936f887288ba2cfbe854af19c6c0307 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:16 +0800 Subject: [PATCH 07/14] Delete HSIC2.py --- causallearn/search/HiddenCausal/GIN/HSIC2.py | 101 ------------------- 1 file changed, 101 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/HSIC2.py diff --git a/causallearn/search/HiddenCausal/GIN/HSIC2.py b/causallearn/search/HiddenCausal/GIN/HSIC2.py deleted file mode 100644 index 8f04c3ca..00000000 --- a/causallearn/search/HiddenCausal/GIN/HSIC2.py +++ /dev/null @@ -1,101 +0,0 @@ -from kerpy.GaussianKernel import GaussianKernel -from numpy import shape,savetxt,loadtxt,transpose,shape,reshape,concatenate -from independence_testing.HSICSpectralTestObject import HSICSpectralTestObject -from independence_testing.HSICBlockTestObject import HSICBlockTestObject -import numpy as np -import pandas as pd - -#Fast HSIC, refer to ./kerpy-master/independence_testing/ExampleHSIC.py - -def test(x,y,alph=0.05): - lens = len(x) - x=x.reshape(lens,1) - y=y.reshape(lens,1) -## -## kernelX=GaussianKernel() -## kernelY=GaussianKernel() - - kernelY = GaussianKernel(float(0.1)) - kernelX=GaussianKernel(float(0.1)) - - - num_samples = lens - - myspectralobject = HSICSpectralTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, - kernelX_use_median=False, kernelY_use_median=False, - rff=True, num_rfx=20, num_rfy=20, num_nullsims=500) - - pvalue = myspectralobject.compute_pvalue(x, y) - - #print(pvalue) - if pvalue >alph: - return True - else: - return False - #return pvalue - -def test2(x,y,alph=0.01): - lens = len(x) - x=x.reshape(lens,1) - y=y.reshape(lens,1) - kernelX=GaussianKernel() - kernelY=GaussianKernel() - - num_samples = lens - - myblockobject = HSICBlockTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, - kernelX_use_median=True, kernelY_use_median=True, - blocksize=80, nullvarmethod='permutation') - - pvalue = myblockobject.compute_pvalue(x, y) - #print(pvalue) - if pvalue >alph: - return True - else: - return False - - - -def INtest(x,y,alph=0.01): - lens = len(x) - x=x.reshape(lens,1) - y=y.reshape(lens,1) - kernelX=GaussianKernel() - kernelY=GaussianKernel() -## kernelY = GaussianKernel(float(0.3)) -## kernelX=GaussianKernel(float(0.3)) - num_samples = lens - - myspectralobject = HSICSpectralTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, - kernelX_use_median=True, kernelY_use_median=True, - rff=True, num_rfx=20, num_rfy=20, num_nullsims=1000) - - pvalue = myspectralobject.compute_pvalue(x, y) - - return pvalue - -def 
INtest2(x,y,alph=0.01): - lens = len(x) - x=x.reshape(lens,1) - y=y.reshape(lens,1) - kernelX=GaussianKernel() - kernelY=GaussianKernel() - num_samples = lens - - myblockobject = HSICBlockTestObject(num_samples, kernelX=kernelX, kernelY=kernelY, - kernelX_use_median=True, kernelY_use_median=True, - blocksize=200, nullvarmethod='permutation') - - pvalue = myblockobject.compute_pvalue(x, y) - - return pvalue - - - -def main(): - x=np.random.uniform(size=10000) - y=np.random.uniform(size=10000)+x - test2(x,y) - -if __name__ == '__main__': - main() From 5229b0682a1ff3eca0326e5027c706ea0f01040d Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:24 +0800 Subject: [PATCH 08/14] Delete Identifying_Causal_Clusters.py --- .../GIN/Identifying_Causal_Clusters.py | 70 ------------------- 1 file changed, 70 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py diff --git a/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py deleted file mode 100644 index 6052b7cd..00000000 --- a/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py +++ /dev/null @@ -1,70 +0,0 @@ -import causallearn.search.HiddenCausal.GIN.GIN2 as GIN2 #based on hsic to find cluster -import causallearn.search.HiddenCausal.GIN.Utils_V2 as Utils #overlap merge utils -import itertools - -#return the combination of list -def FindCombination(Lists,N): - return itertools.combinations(Lists,N) - - -#GIN test by fast HSIC -#X and Z are list, e.g., X=['X1','X2'] Z=['X3'] -#Data.type=Pandas.DataFrame, where data.columns=['x1','x2',...] -def GIN(X,Z,data,test_function,alpha=0.05): - return GIN2.GIN(X,Z,data,test_function,alpha) - #HSIC with fisher method - #return GIN2.FisherGIN(X,Z,data,alpha) - - -#identifying causal cluster from 1-factor to n-factor -#limit=N is to limited the n-factor we found -def FindCluser(data, test_function, alhpa=0.05,limit=0): - indexs=list(range(data.shape[1])) - B = indexs.copy() - Cluster={} - Grlen=2 - - #finding causal cluster, 1-factor to n-factor, recorded by dic type, e.g., {'1':[['x1','x2']],'2':[['x4','x5','x6']} - while len(B) >= Grlen and len(indexs) >=2*Grlen-1: - LatentNum=Grlen-1 - Set_P=FindCombination(B,Grlen) - print('identifying causal cluster: '+str(LatentNum)+'-factor model:') - for P in Set_P: - tind=indexs.copy() - for t in P: - tind.remove(t) # tind= ALLdata\P - if GIN(list(P),tind,data,test_function,alhpa): - key=Cluster.keys() - key = list(key) - if (LatentNum) in key: - temp =Cluster[LatentNum] - temp.append(list(P)) - Cluster[LatentNum]=temp - else: - Cluster[LatentNum]=[list(P)] - key=Cluster.keys() - if LatentNum in key: - Tclu=Cluster[LatentNum] - Tclu=Utils.merge_list(Tclu) - Cluster[LatentNum]=Tclu - #update the B - for i in Tclu: - for j in i: - if j in B: - B.remove(j) - - - Grlen+=1 - print('The identified cluster in identifying '+ str(LatentNum)+ '-factor model:') - print(Cluster) - - #limit the n-factor we found - if limit !=0 and (Grlen-1)>limit: - break - - - - return Cluster - - - From ebb808b9f442fa39d1d7327d7ad75acb79ad45c0 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:31 +0800 Subject: [PATCH 09/14] Delete Infer_Causal_Order.py --- .../HiddenCausal/GIN/Infer_Causal_Order.py | 70 ------------------- 1 file changed, 70 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py 
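The module removed below is built around the GIN surrogate E_{X||Z} = omega'X, where omega is the right singular vector of cov(Z, X) for the smallest singular value, exactly as computed by getomega. A self-contained numpy sketch under an assumed one-latent model (the coefficients and the cube-of-uniform noise are illustrative only):

    import numpy as np

    rng = np.random.default_rng(0)
    n = 20000
    lat = rng.uniform(-1, 1, n) ** 3                  # non-Gaussian latent
    noise = 0.2 * rng.uniform(-1, 1, (n, 3)) ** 3
    x1 = 2.0 * lat + noise[:, 0]
    x2 = -1.5 * lat + noise[:, 1]
    x3 = 0.7 * lat + noise[:, 2]
    data = np.column_stack([x1, x2, x3])

    cov = np.cov(data, rowvar=False)
    A = cov[np.ix_([2], [0, 1])]                      # cov(Z, X), X={x1,x2}, Z={x3}
    omega = np.linalg.svd(A)[2].T[:, -1]              # smallest singular direction
    e_surrogate = data[:, [0, 1]] @ omega             # E_{X||Z} = omega'X
    print(np.corrcoef(e_surrogate, x3)[0, 1])         # close to 0 when the cluster holds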
diff --git a/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py deleted file mode 100644 index 514489c1..00000000 --- a/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py +++ /dev/null @@ -1,70 +0,0 @@ -import causallearn.search.HiddenCausal.GIN.GIN2 as GIN #GIN based on mutual information - - -# identifying causal order by identifying root node recursively. -#cluster.type=dic, which is learned by phase I {identifying causal cluster} -#method 1 = estimating mutual information by k nearest neighbors -def LearnCausalOrder(cluster,data,method='1'): - Cluster=GetCluster(cluster) - #print(Cluster) - - K=[] #causal order - while(len(Cluster) >0): - root = FindRoot(Cluster,data,K,method='1') - K.append(root) - Cluster.remove(root) - return K - -#finding root node by mutual information based GIN -def FindRoot(clu1,data1,K1,method='1'): - cluster=clu1.copy() - data =data1.copy() - K=K1.copy() - if len(cluster) ==1: - return cluster[0] - - MIS=[] - for clu in cluster: - other=cluster.copy() - other.remove(clu) - TempM=0 - for o in other: - X=[] - Z=[] - - for i in range(0,len(clu)): - if i < len(clu)/2: - X.append(clu[i]) - else: - Z.append(clu[i]) - for i in range(0,len(o)): - if i < len(o)/2: - X.append(o[i]) - - - if len(K) != 0: - for i in K: - for j in range(0,len(i)): - if j < int(len(i)/2): - X.append(i[j]) - else: - Z.append(i[j]) - - #print(X,Z) - mi = GIN.GIN_MI(X,Z,data,method='1') - TempM+=mi - TempM =TempM - MIS.append(TempM) - - mins=MIS.index(min(MIS)) - root = cluster[mins] - return root - -def GetCluster(cluster): - Clu=[] - key=cluster.keys() - for i in key: - C=cluster[i] - for j in C: - Clu.append(j) - return Clu From 9299ea2ccf3b638ef45c79c65e7cfdf8abe63f46 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:38 +0800 Subject: [PATCH 10/14] Delete Utils_V2.py --- .../search/HiddenCausal/GIN/Utils_V2.py | 127 ------------------ 1 file changed, 127 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/Utils_V2.py diff --git a/causallearn/search/HiddenCausal/GIN/Utils_V2.py b/causallearn/search/HiddenCausal/GIN/Utils_V2.py deleted file mode 100644 index 04513229..00000000 --- a/causallearn/search/HiddenCausal/GIN/Utils_V2.py +++ /dev/null @@ -1,127 +0,0 @@ -import numpy as np -import pandas as pd -from itertools import combinations -import itertools -import networkx as nx - - - - - - - -def bfs(graph, start): - visited, queue = set(), [start] - while queue: - vertex = queue.pop(0) - if vertex not in visited: - visited.add(vertex) - queue.extend(graph[vertex] - visited) - return visited - -def connected_components(G): - seen = set() - for v in G: - if v not in seen: - c = set(bfs(G, v)) - yield c - seen.update(c) - -def graph(edge_list): - result = {} - for source, target in edge_list: - result.setdefault(source, set()).add(target) - result.setdefault(target, set()).add(source) - return result - - -def merge_list(L2): - l=L2.copy() - - edges = [] - s = list(map(set, l)) - for i, j in combinations(range(len(s)), r=2): - - if s[i].intersection(s[j]): - edges.append((i, j)) - - G = graph(edges) - - result = [] - unassigned = list(range(len(s))) - - for component in connected_components(G): - - union = set().union(*(s[i] for i in component)) - - result.append(sorted(union)) - - unassigned = [i for i in unassigned if i not in component] - - - result.extend(map(sorted, (s[i] for i in unassigned))) - - return result - - -#merge withour 
change any order -def merge_listNew(L2): - - - - l=L2.copy() - - - edges = [] - s = list(map(set, l)) - for i, j in combinations(range(len(s)), r=2): - if s[i].intersection(s[j]): - edges.append((i, j)) - - G = graph(edges) - - result = [] - unassigned = list(range(len(s))) - - for component in connected_components(G): - - component=list(component) - - Cluster=l[component[0]] - - adj= s[0].intersection(s[1]) - for i in component: - if i == 0: - continue - temp=l[i] - for j in list(adj): - if j in temp: - temp.remove(j) - - for j in range(0,len(temp)): - if j < (len(temp)/2): - Cluster.insert(0,temp[j]) - else: - Cluster.append(temp[j]) - - result.append(Cluster) - - - - unassigned = [i for i in unassigned if i not in component] - - - result.extend(map(sorted, (s[i] for i in unassigned))) - - return result - - - - -def main(): - L=[['x1','x2'],['x3','x4'],['x5','x6','x2'],['x1','x9']] - print(merge_list(L)) - - -if __name__ == '__main__': - main() From 36d5d4bca8cb5d59e6a095bd44b74d2f97825b32 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:45 +0800 Subject: [PATCH 11/14] Delete __init__.py --- causallearn/search/HiddenCausal/GIN/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/__init__.py diff --git a/causallearn/search/HiddenCausal/GIN/__init__.py b/causallearn/search/HiddenCausal/GIN/__init__.py deleted file mode 100644 index e69de29b..00000000 From c92cb8f6ebf4b63fc2e14f30e6e81338c0e3db34 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:52 +0800 Subject: [PATCH 12/14] Delete independence.py --- .../search/HiddenCausal/GIN/independence.py | 38 ------------------- 1 file changed, 38 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/independence.py diff --git a/causallearn/search/HiddenCausal/GIN/independence.py b/causallearn/search/HiddenCausal/GIN/independence.py deleted file mode 100644 index dd64ef6b..00000000 --- a/causallearn/search/HiddenCausal/GIN/independence.py +++ /dev/null @@ -1,38 +0,0 @@ -import numpy as np -import pandas as pd -import causallearn.search.HiddenCausal.GIN.mutual as MI -from sklearn import metrics - -#estimating mutual information by sklearn -def independent11(x1,y1): - x=x1.copy() - y=y1.copy() - length=len(x) -## x=x.reshape(length,1) -## y=y.reshape(length,1) - x = list(x) - y = list(y) - - result_NMI=metrics.normalized_mutual_info_score(x, y) - print (result_NMI) - return result_NMI - -#estimating mutual information by Non-parametric computation of entropy and mutual-information -def independent(x1,y1): - x=x1.copy() - y=y1.copy() - length=len(x) - x=x.reshape(length,1) - y=y.reshape(length,1) - - if length >3000: - k=15 - else: - k=10 - - mi= MI.mutual_information((x,y),k) - return abs(mi) - - - - From 4f168e8d235cd4b6deb460b7cd5325811e57d038 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:15:58 +0800 Subject: [PATCH 13/14] Delete mutual.py --- causallearn/search/HiddenCausal/GIN/mutual.py | 276 ------------------ 1 file changed, 276 deletions(-) delete mode 100644 causallearn/search/HiddenCausal/GIN/mutual.py diff --git a/causallearn/search/HiddenCausal/GIN/mutual.py b/causallearn/search/HiddenCausal/GIN/mutual.py deleted file mode 100644 index 35c4cb8b..00000000 --- a/causallearn/search/HiddenCausal/GIN/mutual.py +++ /dev/null @@ -1,276 +0,0 @@ - - -''' -Non-parametric 
computation of entropy and mutual-information -Adapted by G Varoquaux for code created by R Brette, itself -from several papers (see in the code). -These computations rely on nearest-neighbor statistics -''' -import numpy as np - -from scipy.special import gamma,psi -from scipy import ndimage -from scipy.linalg import det -from numpy import pi - -from sklearn.neighbors import NearestNeighbors - -__all__=['entropy', 'mutual_information', 'entropy_gaussian'] - -EPS = np.finfo(float).eps - - -def nearest_distances(X, k=1): - ''' - X = array(N,M) - N = number of points - M = number of dimensions - returns the distance to the kth nearest neighbor for every point in X - ''' - knn = NearestNeighbors(n_neighbors=k + 1) - knn.fit(X) - d, _ = knn.kneighbors(X) # the first nearest neighbor is itself - return d[:, -1] # returns the distance to the kth nearest neighbor - - -def entropy_gaussian(C): - ''' - Entropy of a gaussian variable with covariance matrix C - ''' - if np.isscalar(C): # C is the variance - return .5*(1 + np.log(2*pi)) + .5*np.log(C) - else: - n = C.shape[0] # dimension - return .5*n*(1 + np.log(2*pi)) + .5*np.log(abs(det(C))) - - -def entropy2(u,k=1): - k1 = 79.047 - k2 = 7.4129 - gamma = 0.37457 - return (1 + np.log(2 * np.pi)) / 2 - k1 * (np.mean(np.log(np.cosh(u))) - gamma)**2 - k2 * (np.mean(u * np.exp((-u**2) / 2)))**2 - - -def entropy(X, k=1): - ''' Returns the entropy of the X. - Parameters - =========== - X : array-like, shape (n_samples, n_features) - The data the entropy of which is computed - k : int, optional - number of nearest neighbors for density estimation - Notes - ====== - Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy - of a random vector. Probl. Inf. Transm. 23, 95-101. - See also: Evans, D. 2008 A computationally efficient estimator for - mutual information, Proc. R. Soc. A 464 (2093), 1203-1215. - and: - Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual - information. Phys Rev E 69(6 Pt 2):066138. - ''' - - # Distance to kth nearest neighbor - r = nearest_distances(X, k) # squared distances - n, d = X.shape - volume_unit_ball = (pi**(.5*d)) / gamma(.5*d + 1) - ''' - F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures - for Continuous Random Variables. Advances in Neural Information - Processing Systems 21 (NIPS). Vancouver (Canada), December. - return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k) - ''' - return (d*np.mean(np.log(r + np.finfo(X.dtype).eps)) - + np.log(volume_unit_ball) + psi(n) - psi(k)) - - -def mutual_information(variables, k=1): - ''' - Returns the mutual information between any number of variables. - Each variable is a matrix X = array(n_samples, n_features) - where - n = number of samples - dx,dy = number of dimensions - Optionally, the following keyword argument can be specified: - k = number of nearest neighbors for density estimation - Example: mutual_information((X, Y)), mutual_information((X, Y, Z), k=5) - ''' - if len(variables) < 2: - raise AttributeError( - "Mutual information must involve at least 2 variables") - all_vars = np.hstack(variables) - return (sum([entropy(X, k=k) for X in variables]) - - entropy(all_vars, k=k)) - - -def mutual_information_2d(x, y, sigma=1, normalized=False): - """ - Computes (normalized) mutual information between two 1D variate from a - joint histogram. 
- Parameters - ---------- - x : 1D array - first variable - y : 1D array - second variable - sigma: float - sigma for Gaussian smoothing of the joint histogram - Returns - ------- - nmi: float - the computed similariy measure - """ - bins = (256, 256) - - jh = np.histogram2d(x, y, bins=bins)[0] - - # smooth the jh with a gaussian filter of given sigma - ndimage.gaussian_filter(jh, sigma=sigma, mode='constant', - output=jh) - - # compute marginal histograms - jh = jh + EPS - sh = np.sum(jh) - jh = jh / sh - s1 = np.sum(jh, axis=0).reshape((-1, jh.shape[0])) - s2 = np.sum(jh, axis=1).reshape((jh.shape[1], -1)) - - # Normalised Mutual Information of: - # Studholme, jhill & jhawkes (1998). - # "A normalized entropy measure of 3-D medical image alignment". - # in Proc. Medical Imaging 1998, vol. 3338, San Diego, CA, pp. 132-143. - if normalized: - mi = ((np.sum(s1 * np.log(s1)) + np.sum(s2 * np.log(s2))) - / np.sum(jh * np.log(jh))) - 1 - else: - mi = ( np.sum(jh * np.log(jh)) - np.sum(s1 * np.log(s1)) - - np.sum(s2 * np.log(s2))) - - return mi - - - -############################################################################### -# Tests - -def test_entropy(): - # Testing against correlated Gaussian variables - # (analytical results are known) - # Entropy of a 3-dimensional gaussian variable - rng = np.random.RandomState(0) - n = 50000 - d = 3 - P = np.array([[1, 0, 0], [0, 1, .5], [0, 0, 1]]) - C = np.dot(P, P.T) - Y = rng.randn(d, n) - X = np.dot(P, Y) - H_th = entropy_gaussian(C) - H_est = entropy(X.T, k=5) - # Our estimated entropy should always be less that the actual one - # (entropy estimation undershoots) but not too much - np.testing.assert_array_less(H_est, H_th) - np.testing.assert_array_less(.9*H_th, H_est) - - -def test_mutual_information(): - # Mutual information between two correlated gaussian variables - # Entropy of a 2-dimensional gaussian variable - n = 50000 - rng = np.random.RandomState(0) - #P = np.random.randn(2, 2) - P = np.array([[1, 0], [0.5, 1]]) - C = np.dot(P, P.T) - U = rng.randn(2, n) - Z = np.dot(P, U).T - X = Z[:, 0] - X = X.reshape(len(X), 1) - Y = Z[:, 1] - Y = Y.reshape(len(Y), 1) - # in bits - MI_est = mutual_information((X, Y), k=5) - MI_th = (entropy_gaussian(C[0, 0]) - + entropy_gaussian(C[1, 1]) - - entropy_gaussian(C) - ) - # Our estimator should undershoot once again: it will undershoot more - # for the 2D estimation that for the 1D estimation - print((MI_est, MI_th)) - np.testing.assert_array_less(MI_est, MI_th) - np.testing.assert_array_less(MI_th, MI_est + .3) - - -def test_degenerate(): - # Test that our estimators are well-behaved with regards to - # degenerate solutions - rng = np.random.RandomState(0) - x = rng.randn(50000) - X = np.c_[x, x] - assert np.isfinite(entropy(X)) - assert np.isfinite(mutual_information((x[:, np.newaxis], - x[:, np.newaxis]))) - assert 2.9 < mutual_information_2d(x, x) < 3.1 - - -def test_mutual_information_2d(): - # Mutual information between two correlated gaussian variables - # Entropy of a 2-dimensional gaussian variable - n = 50000 - rng = np.random.RandomState(0) - #P = np.random.randn(2, 2) - P = np.array([[1, 0], [.9, .1]]) - C = np.dot(P, P.T) - U = rng.randn(2, n) - Z = np.dot(P, U).T - X = Z[:, 0] - X = X.reshape(len(X), 1) - Y = Z[:, 1] - Y = Y.reshape(len(Y), 1) - # in bits - MI_est = mutual_information_2d(X.ravel(), Y.ravel()) - MI_th = (entropy_gaussian(C[0, 0]) - + entropy_gaussian(C[1, 1]) - - entropy_gaussian(C) - ) - print((MI_est, MI_th)) - # Our estimator should undershoot once again: it will 
undershoot more - # for the 2D estimation that for the 1D estimation - np.testing.assert_array_less(MI_est, MI_th) - np.testing.assert_array_less(MI_th, MI_est + .2) - -def _entropy(u): - """Calculate entropy using the maximum entropy approximations.""" - k1 = 79.047 - k2 = 7.4129 - gamma = 0.37457 - return (1 + np.log(2 * np.pi)) / 2 - k1 * (np.mean(np.log(np.cosh(u))) - gamma)**2 - k2 * (np.mean(u * np.exp((-u**2) / 2)))**2 - -def main(): - x=np.random.uniform(size=1000) - y=np.random.uniform(size=1000)*0.2+0.5*x - - mi=mutual_information_2d(x,y,normalized=True) - - x=x.reshape(1000,1) - y=y.reshape(1000,1) - - mi2=mutual_information((x,y),k=10) - print(mi) - print(mi2) -## -## x=x.reshape(10000,1) -## y=y.reshape(10000,1) -## a =entropy(x) -## b=_entropy(x) -## -## print(a) -## print(b) - - - -if __name__ == '__main__': - # Run our tests -## test_entropy() -## test_mutual_information() -## test_degenerate() -## test_mutual_information_2d() - main() \ No newline at end of file From e30742605f70fc0b70964f5d408f2d460d7d6f65 Mon Sep 17 00:00:00 2001 From: Feng Xie <33390790+fengxie009@users.noreply.github.com> Date: Sat, 17 Dec 2022 09:16:19 +0800 Subject: [PATCH 14/14] Add files via upload --- causallearn/search/HiddenCausal/GIN/GIN.py | 75 +++++++ .../GIN/Identifying_Causal_Clusters.py | 72 +++++++ .../HiddenCausal/GIN/Infer_Causal_Order.py | 101 ++++++++++ .../search/HiddenCausal/GIN/Merge_Cluster.py | 62 ++++++ .../HiddenCausal/GIN/Mutual_Information.py | 185 ++++++++++++++++++ .../HiddenCausal/GIN/Test_GIN_Condition.py | 81 ++++++++ 6 files changed, 576 insertions(+) create mode 100644 causallearn/search/HiddenCausal/GIN/GIN.py create mode 100644 causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py create mode 100644 causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py create mode 100644 causallearn/search/HiddenCausal/GIN/Merge_Cluster.py create mode 100644 causallearn/search/HiddenCausal/GIN/Mutual_Information.py create mode 100644 causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py diff --git a/causallearn/search/HiddenCausal/GIN/GIN.py b/causallearn/search/HiddenCausal/GIN/GIN.py new file mode 100644 index 00000000..f0a762a3 --- /dev/null +++ b/causallearn/search/HiddenCausal/GIN/GIN.py @@ -0,0 +1,75 @@ +from causallearn.graph.GeneralGraph import GeneralGraph +from causallearn.graph.GraphNode import GraphNode +from causallearn.graph.NodeType import NodeType +from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma +from causallearn.utils.KCI.KCI import KCI_UInd + +import causallearn.search.HiddenCausal.GIN.Infer_Causal_Order as ICO +import causallearn.search.HiddenCausal.GIN.Identifying_Causal_Clusters as ICC + + + +def GIN(data, indep_test_method='kci', alpha=0.05, MAX_Factor = 2): + ''' + Learning causal structure of Latent Variables for Linear Non-Gaussian Latent Variable Model + with Generalized Independent Noise Condition + Parameters + ---------- + data : numpy ndarray + data set + indep_test_method : str, default='kci' + the name of the independence test being used + alpha : float, default=0.05 + desired significance level of independence tests (p_value) in (0,1) + MAX_Factor : int + Prior maximum factor number + Returns + ------- + G : general graph + causal graph + causal_order : list + causal order + ''' + if indep_test_method not in ['kci', 'hsic']: + raise NotImplementedError((f"Independent test method {indep_test_method} is not implemented.")) + + if indep_test_method == 'kci': + kci = KCI_UInd() + + if indep_test_method == 'kci': + 
def indep_test(x, y):
+            return kci.compute_pvalue(x, y)[0]
+    elif indep_test_method == 'hsic':
+        def indep_test(x, y):
+            return hsic_test_gamma(x, y)[1]
+    else:
+        raise NotImplementedError(f"Independent test method {indep_test_method} is not implemented.")
+
+    # Phase I: identify the causal clusters with the GIN condition (KCI or HSIC test)
+    Cluster = ICC.FindCluster(data, indep_test, alpha, MAX_Factor)
+    # Phase II: infer the causal order of the clusters via mutual information (k-nearest-neighbor estimate)
+    CausalOrder = ICO.LearnCausalOrder(Cluster, data)
+
+    latent_id = 1
+    l_nodes = []
+    G = GeneralGraph([])
+    for cluster in CausalOrder:
+        l_node = GraphNode(f"L{latent_id}")
+        l_node.set_node_type(NodeType.LATENT)
+        l_nodes.append(l_node)
+        G.add_node(l_node)
+        # every earlier latent is a cause of the current latent
+        for l in l_nodes:
+            if l != l_node:
+                G.add_directed_edge(l, l_node)
+        # each latent points to the observed variables in its cluster
+        for o in cluster:
+            o_node = GraphNode(f"X{o + 1}")
+            G.add_node(o_node)
+            G.add_directed_edge(l_node, o_node)
+        latent_id += 1
+
+    return G, CausalOrder
+
diff --git a/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py
new file mode 100644
index 00000000..d7f8170f
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Identifying_Causal_Clusters.py
@@ -0,0 +1,72 @@
+import itertools
+
+import causallearn.search.HiddenCausal.GIN.Test_GIN_Condition as Test_GIN_Condition  # HSIC/KCI-based cluster test
+import causallearn.search.HiddenCausal.GIN.Merge_Cluster as Merge_Cluster  # utilities for merging overlapping clusters
+
+
+def FindCluster(data, test_function, alpha=0.05, Max_Factor=2):
+    '''
+    Learn the causal clusters of a Linear Non-Gaussian Latent Variable Model with the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    data : numpy ndarray
+        data set
+    test_function : callable
+        the independence test being used; takes two samples and returns a p-value
+    alpha : float, default=0.05
+        desired significance level of independence tests (p_value) in (0, 1)
+    Max_Factor : int
+        maximum number of factors (latent variables) per cluster
+    Returns
+    -------
+    Cluster : dict
+        causal clusters keyed by factor number, e.g., {1: [['x1', 'x2']], 2: [['x4', 'x5', 'x6']]}
+    '''
+    # Initialize the variable set
+    indexs = list(range(data.shape[1]))
+    B = indexs.copy()
+    Cluster = {}
+    Grlen = 2
+
+    while len(B) >= Grlen and len(indexs) >= 2 * Grlen - 1:
+        LatentNum = Grlen - 1
+        Set_P = itertools.combinations(B, Grlen)
+        print('Identifying causal clusters with the ' + str(LatentNum) + '-factor model:')
+        for P in Set_P:
+            tind = indexs.copy()
+            for t in P:
+                tind.remove(t)  # tind = all variables \ P
+            if GIN(list(P), tind, data, test_function, alpha):
+                key = list(Cluster.keys())
+                if LatentNum in key:
+                    temp = Cluster[LatentNum]
+                    temp.append(list(P))
+                    Cluster[LatentNum] = temp
+                else:
+                    Cluster[LatentNum] = [list(P)]
+
+        print('Candidate clusters:', Cluster)
+        # Merge overlapping clusters and remove their variables from the search set
+        key = Cluster.keys()
+        if LatentNum in key:
+            Tclu = Merge_Cluster.merge_list(Cluster[LatentNum])
+            Cluster[LatentNum] = Tclu
+            for i in Tclu:
+                for j in i:
+                    if j in B:
+                        B.remove(j)
+
+        Grlen += 1
+        print('The identified clusters for the ' + str(LatentNum) + '-factor model:', Cluster)
+
+        if Max_Factor != 0 and (Grlen - 1) > Max_Factor:
+            break
+
+    return Cluster
+
+
+def GIN(X, Z, data, test_function, alpha=0.05):
+    # To pool the p-values with Fisher's method instead, swap in Test_GIN_Condition.FisherGIN
+    return Test_GIN_Condition.GIN(X, Z, data, test_function, alpha)
+
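Before the patch moves on to Phase II, it may help to see the Phase-I test in isolation. The sketch below is not part of the patch: it simulates a two-latent model, rebuilds the GIN residual with the same SVD null-space construction as getomega in Test_GIN_Condition.py (inlined here as the hypothetical helper gin_residual), and checks it with hsic_test_gamma, which the patch already imports in GIN.py. In a typical run the true cluster {X1, X2} yields large p-values while a wrong grouping yields small ones.

    import numpy as np
    from causallearn.search.FCMBased.lingam.hsic import hsic_test_gamma

    rng = np.random.default_rng(0)
    n = 1500
    L1 = rng.uniform(-1, 1, n)              # non-Gaussian root latent
    L2 = 0.7 * L1 + rng.uniform(-1, 1, n)   # L1 -> L2
    data = np.column_stack([
        1.0 * L1 + 0.2 * rng.uniform(-1, 1, n),   # X1, X2 measure L1
        0.8 * L1 + 0.2 * rng.uniform(-1, 1, n),
        1.0 * L2 + 0.2 * rng.uniform(-1, 1, n),   # X3, X4 measure L2
        0.9 * L2 + 0.2 * rng.uniform(-1, 1, n),
    ])

    def gin_residual(data, X, Z):
        # same construction as getomega: omega spans the null space of cov(Z, X)
        cov = np.cov(data, rowvar=False)
        _, _, v = np.linalg.svd(cov[np.ix_(Z, X)])
        return data[:, X] @ v.T[:, -1]

    for X in ([0, 1], [0, 2]):              # a true cluster vs. a wrong grouping
        Z = [i for i in range(4) if i not in X]
        e = gin_residual(data, X, Z)
        pvals = [hsic_test_gamma(e[:, None], data[:, [z]])[1] for z in Z]
        print(X, [round(p, 3) for p in pvals])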
diff --git a/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py
new file mode 100644
index 00000000..cc3813d8
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Infer_Causal_Order.py
@@ -0,0 +1,101 @@
+import causallearn.search.HiddenCausal.GIN.Test_GIN_Condition as GIN  # GIN statistic based on mutual information
+
+
+def LearnCausalOrder(cluster, data):
+    '''
+    Learn the causal order of a Linear Non-Gaussian Latent Variable Model with the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    cluster : dict
+        causal clusters learned in Phase I
+    data : numpy ndarray
+        data set
+    Returns
+    -------
+    causal_order : list
+        causal order of the clusters
+    '''
+    Cluster = GetCluster(cluster)
+    K = []  # Initialize the causal order (K in the paper)
+    while len(Cluster) > 0:
+        root = FindRoot(Cluster, data, K)
+        K.append(root)
+        Cluster.remove(root)
+    return K
+
+
+def FindRoot(clusters, data, causal_order):
+    '''
+    Find the root cluster, i.e., the cluster whose GIN residual carries the least dependence
+    Parameters
+    ----------
+    clusters : list
+        clusters of observed variables
+    data : numpy ndarray
+        data set
+    causal_order : list
+        causal order identified so far
+    Returns
+    -------
+    root : list
+        cluster of the latent root cause
+    '''
+    if len(clusters) == 1:
+        return clusters[0]
+    root = clusters[0]
+    MI_lists = []
+    for i in clusters:
+        MI = 0
+        for j in clusters:
+            if i == j:
+                continue
+
+            # Test whether cluster i precedes cluster j:
+            # one variable of j plus half of i against the other half of i
+            X = [j[0]]
+            Z = []
+            r_1, r_2 = array_split(i, 2)
+            X = X + r_1
+            Z = Z + r_2
+
+            # Condition on the clusters that are already ordered
+            if causal_order:
+                for k in causal_order:
+                    k_1, k_2 = array_split(k, 2)
+                    X = X + k_1
+                    Z = Z + k_2
+
+            tmi = GIN.GIN_MI(X, Z, data)
+            MI += tmi
+
+        MI_lists.append(MI)
+
+    # The root cluster minimizes the accumulated mutual information
+    mins = MI_lists.index(min(MI_lists))
+    root = clusters[mins]
+    return root
+
+
+def GetCluster(cluster):
+    '''Flatten the Phase-I dictionary into a plain list of clusters.'''
+    Clu = []
+    key = cluster.keys()
+    for i in key:
+        C = cluster[i]
+        for j in C:
+            Clu.append(j)
+    return Clu
+
+
+def array_split(x, k):
+    '''Split the list x into k contiguous parts of (nearly) equal size.'''
+    x_len = len(x)
+    sub_arys = []
+    start = 0
+    section_len = x_len // k
+    extra = x_len % k
+    for i in range(extra):
+        sub_arys.append(x[start:start + section_len + 1])
+        start = start + section_len + 1
+
+    for i in range(k - extra):
+        sub_arys.append(x[start:start + section_len])
+        start = start + section_len
+    return sub_arys
\ No newline at end of file
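The X/Z surrogates above hinge on array_split halving each cluster, so its exact splitting rule matters. A quick, runnable check of that rule once the patch is applied (the printed lists are what the function returns):

    from causallearn.search.HiddenCausal.GIN.Infer_Causal_Order import array_split

    print(array_split(['x1', 'x2', 'x3', 'x4', 'x5'], 2))
    # [['x1', 'x2', 'x3'], ['x4', 'x5']]  -- the odd element lands in the first half
    print(array_split([0, 1, 2, 3], 2))
    # [[0, 1], [2, 3]]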
diff --git a/causallearn/search/HiddenCausal/GIN/Merge_Cluster.py b/causallearn/search/HiddenCausal/GIN/Merge_Cluster.py
new file mode 100644
index 00000000..37242783
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Merge_Cluster.py
@@ -0,0 +1,62 @@
+from itertools import combinations
+
+
+def merge_list(L2):
+    '''Merge overlapping causal clusters.'''
+    l = L2.copy()
+
+    # Build an overlap graph: one node per cluster, an edge whenever two clusters share a variable
+    edges = []
+    s = list(map(set, l))
+    for i, j in combinations(range(len(s)), r=2):
+        if s[i].intersection(s[j]):
+            edges.append((i, j))
+
+    G = graph(edges)
+
+    result = []
+    unassigned = list(range(len(s)))
+
+    # Each connected component of the overlap graph becomes one merged cluster
+    for component in connected_components(G):
+        union = set().union(*(s[i] for i in component))
+        result.append(sorted(union))
+        unassigned = [i for i in unassigned if i not in component]
+
+    # Clusters that overlap with nothing are kept as they are
+    result.extend(map(sorted, (s[i] for i in unassigned)))
+
+    return result
+
+
+def bfs(graph, start):
+    '''Breadth-first search; returns the set of vertices reachable from start.'''
+    visited, queue = set(), [start]
+    while queue:
+        vertex = queue.pop(0)
+        if vertex not in visited:
+            visited.add(vertex)
+            queue.extend(graph[vertex] - visited)
+    return visited
+
+
+def connected_components(G):
+    '''Yield the connected components of G as sets of vertices.'''
+    seen = set()
+    for v in G:
+        if v not in seen:
+            c = set(bfs(G, v))
+            yield c
+            seen.update(c)
+
+
+def graph(edge_list):
+    '''Build an undirected adjacency-set representation from an edge list.'''
+    result = {}
+    for source, target in edge_list:
+        result.setdefault(source, set()).add(target)
+        result.setdefault(target, set()).add(source)
+    return result
+
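A short usage check of merge_list, runnable once the patch is applied: overlapping clusters collapse into the sorted union of their connected component, and disjoint clusters pass through untouched.

    from causallearn.search.HiddenCausal.GIN.Merge_Cluster import merge_list

    # [1, 2] and [2, 3] share the variable 2, so they merge; [4, 5] is kept as is
    print(merge_list([[1, 2], [2, 3], [4, 5]]))
    # [[1, 2, 3], [4, 5]]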
diff --git a/causallearn/search/HiddenCausal/GIN/Mutual_Information.py b/causallearn/search/HiddenCausal/GIN/Mutual_Information.py
new file mode 100644
index 00000000..037aa0e8
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Mutual_Information.py
@@ -0,0 +1,185 @@
+'''
+Non-parametric computation of entropy and mutual-information
+Adapted by G Varoquaux for code created by R Brette, itself
+from several papers (see in the code).
+These computations rely on nearest-neighbor statistics
+'''
+import numpy as np
+
+from scipy.special import gamma, psi
+from scipy import ndimage
+from scipy.linalg import det
+from numpy import pi
+
+from sklearn import metrics
+from sklearn.neighbors import NearestNeighbors
+
+__all__ = ['entropy', 'mutual_information', 'entropy_gaussian']
+
+EPS = np.finfo(float).eps
+
+
+def nearest_distances(X, k=1):
+    '''
+    X = array(N,M)
+    N = number of points
+    M = number of dimensions
+    Returns the distance to the kth nearest neighbor for every point in X
+    '''
+    knn = NearestNeighbors(n_neighbors=k + 1)
+    knn.fit(X)
+    d, _ = knn.kneighbors(X)  # the first nearest neighbor of a point is the point itself
+    return d[:, -1]  # distance to the kth (true) nearest neighbor
+
+
+def entropy_gaussian(C):
+    '''
+    Entropy of a Gaussian variable with covariance matrix C
+    '''
+    if np.isscalar(C):  # C is the variance
+        return .5 * (1 + np.log(2 * pi)) + .5 * np.log(C)
+    else:
+        n = C.shape[0]  # dimension
+        return .5 * n * (1 + np.log(2 * pi)) + .5 * np.log(abs(det(C)))
+
+
+def entropy(X, k=1):
+    '''Returns the entropy of X.
+    Parameters
+    ===========
+    X : array-like, shape (n_samples, n_features)
+        The data the entropy of which is computed
+    k : int, optional
+        number of nearest neighbors for density estimation
+    Notes
+    ======
+    Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
+    of a random vector. Probl. Inf. Transm. 23, 95-101.
+    See also: Evans, D. 2008 A computationally efficient estimator for
+    mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
+    and:
+    Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
+    information. Phys Rev E 69(6 Pt 2):066138.
+    '''
+    # Distance to the kth nearest neighbor
+    r = nearest_distances(X, k)
+    n, d = X.shape
+    volume_unit_ball = (pi**(.5*d)) / gamma(.5*d + 1)
+    '''
+    F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
+    for Continuous Random Variables. Advances in Neural Information
+    Processing Systems 21 (NIPS). Vancouver (Canada), December.
+    return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
+    '''
+    return (d * np.mean(np.log(r + np.finfo(X.dtype).eps))
+            + np.log(volume_unit_ball) + psi(n) - psi(k))
+
+
+def mutual_information(variables, k=1):
+    '''
+    Returns the mutual information between any number of variables.
+    Each variable is a matrix X = array(n_samples, n_features)
+    where
+      n = number of samples
+      dx,dy = number of dimensions
+    Optionally, the following keyword argument can be specified:
+      k = number of nearest neighbors for density estimation
+    Example: mutual_information((X, Y)), mutual_information((X, Y, Z), k=5)
+    '''
+    if len(variables) < 2:
+        raise AttributeError(
+            "Mutual information must involve at least 2 variables")
+    all_vars = np.hstack(variables)
+    return (sum([entropy(X, k=k) for X in variables])
+            - entropy(all_vars, k=k))
+
+
+def mutual_information_2d(x, y, sigma=1, normalized=False):
+    """
+    Computes (normalized) mutual information between two 1D variates from a
+    joint histogram.
+    Parameters
+    ----------
+    x : 1D array
+        first variable
+    y : 1D array
+        second variable
+    sigma : float
+        sigma for Gaussian smoothing of the joint histogram
+    Returns
+    -------
+    nmi : float
+        the computed similarity measure
+    """
+    bins = (256, 256)
+
+    jh = np.histogram2d(x, y, bins=bins)[0]
+
+    # smooth the joint histogram with a Gaussian filter of given sigma
+    ndimage.gaussian_filter(jh, sigma=sigma, mode='constant',
+                            output=jh)
+
+    # compute marginal histograms
+    jh = jh + EPS
+    sh = np.sum(jh)
+    jh = jh / sh
+    s1 = np.sum(jh, axis=0).reshape((-1, jh.shape[0]))
+    s2 = np.sum(jh, axis=1).reshape((jh.shape[1], -1))
+
+    # Normalised Mutual Information of:
+    # Studholme, jhill & jhawkes (1998).
+    # "A normalized entropy measure of 3-D medical image alignment".
+    # in Proc. Medical Imaging 1998, vol. 3338, San Diego, CA, pp. 132-143.
+    if normalized:
+        mi = ((np.sum(s1 * np.log(s1)) + np.sum(s2 * np.log(s2)))
+              / np.sum(jh * np.log(jh))) - 1
+    else:
+        mi = (np.sum(jh * np.log(jh)) - np.sum(s1 * np.log(s1))
+              - np.sum(s2 * np.log(s2)))
+
+    return mi
+
+
+def _entropy(u):
+    """Calculate entropy using the maximum-entropy approximation."""
+    k1 = 79.047
+    k2 = 7.4129
+    gamma = 0.37457
+    return (1 + np.log(2 * np.pi)) / 2 - k1 * (np.mean(np.log(np.cosh(u))) - gamma)**2 - k2 * (np.mean(u * np.exp((-u**2) / 2)))**2
+
+
+def MI_2(x1, y1):
+    """Estimate mutual information with sklearn's normalized score
+    (note: the inputs are treated as discrete labels)."""
+    x = list(x1.copy())
+    y = list(y1.copy())
+
+    result_NMI = metrics.normalized_mutual_info_score(x, y)
+
+    return result_NMI
+
+
+def MI_1(x1, y1):
+    """Estimate mutual information by the non-parametric (nearest-neighbor)
+    computation of entropy and mutual information above."""
+    x = x1.reshape(len(x1), 1)
+    y = y1.reshape(len(y1), 1)
+
+    # choose the number of nearest neighbors according to the sample size
+    if len(x) > 3000:
+        k = 15
+    else:
+        k = 10
+
+    mi = mutual_information((x, y), k)
+    return abs(mi)
\ No newline at end of file
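The deleted mutual.py shipped self-tests comparing the k-NN estimator against the closed form for correlated Gaussians; the same sanity check still applies to the re-added module. A sketch mirroring those tests, with the sample size reduced (the estimate is expected to undershoot the analytic value slightly):

    import numpy as np
    from causallearn.search.HiddenCausal.GIN.Mutual_Information import (
        entropy_gaussian, mutual_information)

    rng = np.random.RandomState(0)
    P = np.array([[1, 0], [0.5, 1]])
    C = np.dot(P, P.T)                     # covariance of the simulated pair
    Z = np.dot(P, rng.randn(2, 10000)).T
    X, Y = Z[:, [0]], Z[:, [1]]

    MI_est = mutual_information((X, Y), k=5)
    MI_th = (entropy_gaussian(C[0, 0]) + entropy_gaussian(C[1, 1])
             - entropy_gaussian(C))        # analytic mutual information
    print(MI_est, MI_th)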
diff --git a/causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py b/causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py
new file mode 100644
index 00000000..5625257b
--- /dev/null
+++ b/causallearn/search/HiddenCausal/GIN/Test_GIN_Condition.py
@@ -0,0 +1,81 @@
+import numpy as np
+import causallearn.search.HiddenCausal.GIN.FisherTest as FisherTest
+import causallearn.search.HiddenCausal.GIN.Mutual_Information as ID
+
+
+def GIN(X, Z, data, test_function, alpha=0.05):
+    '''
+    Test the Generalized Independent Noise (GIN) condition
+    Parameters
+    ----------
+    data : numpy ndarray
+        data set
+    X, Z : list
+        indices of the observed variables in the two variable sets
+    test_function : callable
+        the independence test being used; takes two samples and returns a p-value
+    alpha : float, default=0.05
+        desired significance level of independence tests (p_value) in (0,1)
+    Returns
+    -------
+    Boolean : True if the GIN condition holds, False otherwise
+    '''
+    omega = getomega(data, X, Z)
+    tdata = data[:, X]
+    result = np.dot(tdata, omega)[:, None]
+    # the condition holds only if the residual is independent of every variable in Z
+    for i in Z:
+        temp = np.array(data[:, [i]])
+        pval = test_function(result, temp)
+        if pval <= alpha:
+            return False
+
+    return True
+
+
+def GIN_MI(X, Z, data):
+    '''Measure the dependence of the GIN residual on Z by the average
+    k-nearest-neighbor mutual information.'''
+    omega = getomega(data, X, Z)
+    tdata = data[:, X]
+    result = np.dot(tdata, omega)
+    MIS = 0
+    for i in Z:
+        temp = np.array(data[:, i])
+        mi = ID.MI_1(result, temp)
+        MIS += mi
+    MIS = MIS / len(Z)
+
+    return MIS
+
+
+def FisherGIN(X, Z, data, test_function, alpha=0.01):
+    '''Test the GIN condition by pooling the p-values with Fisher's method.'''
+    omega = getomega(data, X, Z)
+    tdata = data[:, X]
+    result = np.dot(tdata, omega)[:, None]
+    pvals = []
+
+    for i in Z:
+        temp = np.array(data[:, [i]])
+        pval = test_function(result, temp)
+        pvals.append(pval)
+
+    flag, fisher_pval = FisherTest.FisherTest(pvals, alpha)
+
+    return flag
+
+
+def getomega(data, X, Z):
+    '''Return omega spanning the null space of cov(Z, X), i.e., the last
+    right-singular vector, so that cov(Z, X omega) is (approximately) zero.'''
+    cov = np.cov(data, rowvar=False)
+    cov_m = cov[np.ix_(Z, X)]
+    _, _, v = np.linalg.svd(cov_m)
+    omega = v.T[:, -1]
+    return omega
+
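With all six files in place, the public entry point can be exercised end to end. A final sketch, not part of the patch (the coefficients, sample size, and the expected printout are illustrative, and KCI on a few thousand samples can take a while):

    import numpy as np
    from causallearn.search.HiddenCausal.GIN.GIN import GIN

    rng = np.random.default_rng(0)
    n = 1000
    L1 = rng.uniform(-1, 1, n)                  # non-Gaussian latents, L1 -> L2
    L2 = 0.8 * L1 + rng.uniform(-1, 1, n)
    noise = lambda: 0.2 * rng.uniform(-1, 1, n)
    data = np.column_stack([
        L1 + noise(), 0.9 * L1 + noise(),       # X1, X2 measure L1
        L2 + noise(), 0.7 * L2 + noise(),       # X3, X4 measure L2
    ])

    g, causal_order = GIN(data, indep_test_method='kci', alpha=0.05)
    print(causal_order)   # in a typical run: [[0, 1], [2, 3]], the L1 cluster first
    print(g)              # GeneralGraph with L1 -> L2 and latent -> indicator edges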