I am trying to create a sample data set (most of the code is from this question). It is almost how I want it to be. However, there are two things I still want to do but cannot figure out how to do:
1. I would like to create a higher correlation between `y` and `year`, without rearranging the whole data set (so by only changing the values of `y`).
2. If possible, I would like to be able to determine the true correlation between `event` and `y` (again, only `y` can be changed). Currently I just manually changed the `set.seed()` value until I got a significant relation.
Could someone help me by explaining how to do this?
# Simulate N observations of an outcome y driven by a regressor x that is
# correlated with the outcome disturbance, plus an instrument z.
set.seed(2) # fix the RNG so the simulated sample is reproducible
a <- 2 # structural parameter of interest
b <- 1 # strength of instrument
rho <- 0.5 # degree of endogeneity
N <- 1000 # number of simulated observations
z <- rnorm(N) # instrument
res1 <- rnorm(N) # disturbance entering x
# Disturbance entering ys; it shares the component res1 * rho with x's
# disturbance, which is what makes x correlated with it.
res2 <- res1 * rho + sqrt(1 - rho * rho) * rnorm(N)
x <- z * b + res1
ys <- x * a + res2 # continuous outcome before truncation and rounding
d <- (ys > 0) # dummy variable
# y equals 10 wherever ys <= 0 and round(10 - ys) otherwise.
y <- round(10 - (d * ys))
# Draw one value per observation. The previous hard-coded length of 100 was
# silently recycled ten times when combined with the N-row columns, leaving
# only 100 distinct values in a repeating pattern. This is the last RNG call,
# so the draws for z, res1 and res2 above are unaffected.
random_variable <- rnorm(N, mean = 0, sd = 1)
library(data.table)
# Collect the simulated columns into one frame, one row per observation.
# Note that data.frame() recycles any column whose length evenly divides the
# longest one, so column lengths are not checked here.
DT_1 <- data.frame(
  y = y,
  x = x,
  z = z,
  random_variable = random_variable
)
# 50-row panel: 10 units observed in each of 5 survey years.
# Within every year the 10 rows follow the same group layout (4 x A, 4 x B,
# 2 x C); the columns below are built from those repeating patterns.
panel_years <- c(1995, 2000, 2005, 2010, 2015)
rows_per_year <- 10
group_layout <- rep(c("A", "B", "C"), times = c(4, 4, 2))
# Per-year outcome pattern used in the years where win_or_lose is non-zero.
outcome_layout <- rep(c(-1, 1, 0), times = c(4, 4, 2))

DT_2 <- structure(
  list(
    ID = as.numeric(seq_len(50)),
    year = rep(panel_years, each = rows_per_year),
    Group = rep(group_layout, times = length(panel_years)),
    # event is 1 in every year except 2005.
    event = rep(c(1, 1, 0, 1, 1), each = rows_per_year),
    # win_or_lose is all zero in 2000 and 2005, patterned otherwise.
    win_or_lose = c(
      outcome_layout,
      rep(0, 2 * rows_per_year),
      outcome_layout,
      outcome_layout
    )
  ),
  row.names = c(NA, -50L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Convert both tables to data.table. setDT() works by reference, so the
# result does not need to be assigned back.
setDT(DT_1)
setDT(DT_2)
# Stack the 50-row panel on top of 19 further copies of itself (20 copies in
# total) so that it has as many rows as DT_1.
DT_2 <- rbind(DT_2, DT_2[rep(seq_len(50L), times = 19L), ])
# Bind the two tables side by side; rows are paired purely by position.
sandbox <- cbind(DT_1, DT_2)