DoWhy example on the IHDP (Infant Health and Development Program) dataset
[1]:
# importing required libraries
import os, sys
sys.path.append(os.path.abspath("../../"))
import dowhy
from dowhy import CausalModel
import pandas as pd
import numpy as np
Loading Data
[2]:
# Load the IHDP CSV from the CEVAE repository. header=None tells pandas the
# file has no header row, so the column names are assigned explicitly below.
data = pd.read_csv(
    "https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv",
    header=None,
)

# Five named leading columns followed by x1..x25 (30 columns in total).
col = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1"]
col.extend("x" + str(i) for i in range(1, 26))
data.columns = col

# Last expression of the cell: shows the first rows as a sanity check.
data.head()
[2]:
treatment | y_factual | y_cfactual | mu0 | mu1 | x1 | x2 | x3 | x4 | x5 | ... | x16 | x17 | x18 | x19 | x20 | x21 | x22 | x23 | x24 | x25 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5.599916 | 4.318780 | 3.268256 | 6.854457 | -0.528603 | -0.343455 | 1.128554 | 0.161703 | -0.316603 | ... | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 6.875856 | 7.856495 | 6.636059 | 7.562718 | -1.736945 | -1.802002 | 0.383828 | 2.244320 | -0.629189 | ... | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 2.996273 | 6.633952 | 1.570536 | 6.121617 | -0.807451 | -0.202946 | -0.360898 | -0.879606 | 0.808706 | ... | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 1.366206 | 5.697239 | 1.244738 | 5.889125 | 0.390083 | 0.596582 | -1.850350 | -0.879606 | -0.004017 | ... | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 1.963538 | 6.202582 | 1.685048 | 6.191994 | -1.045229 | -0.602710 | 0.011465 | 0.161703 | 0.683672 | ... | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 30 columns
1. Model
[3]:
# Create a causal model from the data and given common causes.
#
# The common causes are the columns x1..x25. Build the list of names
# directly: concatenating "x1+x2+...+x25+" and splitting on "+" leaves a
# trailing empty string, which is then passed to CausalModel as a bogus
# common cause named ''.
common_cause_names = ["x" + str(i) for i in range(1, 26)]

# "+"-separated form of the same names, without a trailing separator.
# NOTE(review): kept in case a later cell reads `xs`; drop if nothing does.
xs = "+".join(common_cause_names)

model = CausalModel(
    data=data,
    treatment="treatment",
    outcome="y_factual",
    common_causes=common_cause_names,
)
WARNING:dowhy.causal_model:Causal Graph not provided. DoWhy will construct a graph based on data inputs. INFO:dowhy.causal_model:Model to find the causal effect of treatment ['treatment'] on outcome ['y_factual']
2. Identify
[4]:
# Identify the causal effect.
# Calls identify_effect() on the CausalModel and keeps the returned estimand;
# the estimation cells pass it to model.estimate_effect().
# NOTE(review): the recorded run shows an interactive "[y/n]" prompt about
# ignoring unobserved confounders at this step, so unattended execution may
# block here - confirm against the installed DoWhy version.
identified_estimand = model.identify_effect()
INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['', 'x1', 'x7', 'x16', 'x6', 'x19', 'x17', 'x21', 'x14', 'x25', 'x10', 'x9', 'x12', 'x2', 'x5', 'x24', 'x15', 'x3', 'x20', 'x8', 'x23', 'x4', 'x22', 'x18', 'x11', 'x13'] WARNING:dowhy.causal_identifier:There are unobserved common causes. Causal effect cannot be identified.
WARN: Do you want to continue by ignoring these unobserved confounders? [y/n] y
INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]
3. Estimate (using different methods)
3.1 Using Linear Regression
[5]:
# Estimate the effect with the backdoor linear-regression estimator, asking
# for a significance test, then print the full report and the point estimate.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
    test_significance=True,
)
print(estimate)
print("Causal Estimate is " + str(estimate.value))

# For comparison, print the plain difference between the mean factual outcome
# of the treated rows and that of the untreated rows (labelled "ATE" in the
# output). This is a raw group-mean difference with no adjustment applied.
is_treated = data["treatment"] == 1
is_control = data["treatment"] == 0
data_1 = data[is_treated]
data_0 = data[is_control]
mean_treated = np.mean(data_1["y_factual"])
mean_control = np.mean(data_0["y_factual"])
print("ATE", mean_treated - mean_control)
INFO:dowhy.causal_estimator:INFO: Using Linear Regression Estimator INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x7+x16+x6+x19+x17+x21+x14+x25+x10+x9+x12+x2+x5+x24+x15+x3+x20+x8+x23+x4+x22+x18+x11+x13
*** Causal Estimate *** ## Target estimand Estimand type: ate ### Estimand : 1 Estimand name: iv No such variable found! ### Estimand : 2 Estimand name: backdoor Estimand expression: d ──────────(Expectation(y_factual|x1,x7,x16,x6,x19,x17,x21,x14,x25,x10,x9,x12,x dtreatment 2,x5,x24,x15,x3,x20,x8,x23,x4,x22,x18,x11,x13)) Estimand assumption 1, Unconfoundedness: If U→treatment and U→y_factual then P(y_factual|treatment,x1,x7,x16,x6,x19,x17,x21,x14,x25,x10,x9,x12,x2,x5,x24,x15,x3,x20,x8,x23,x4,x22,x18,x11,x13,U) = P(y_factual|treatment,x1,x7,x16,x6,x19,x17,x21,x14,x25,x10,x9,x12,x2,x5,x24,x15,x3,x20,x8,x23,x4,x22,x18,x11,x13) ## Realized estimand b: y_factual~treatment+x1+x7+x16+x6+x19+x17+x21+x14+x25+x10+x9+x12+x2+x5+x24+x15+x3+x20+x8+x23+x4+x22+x18+x11+x13 ## Estimate Value: 3.928671750872714 ## Statistical Significance p-value: <0.001 Causal Estimate is 3.928671750872714 ATE 4.021121012430829
3.2 Using Propensity Score Matching
[6]:
estimate = model.estimate_effect(identified_estimand,
method_name="backdoor.propensity_score_matching"
)
print("Causal Estimate is " + str(estimate.va