I recently conducted a survey within an IT company concerning user satisfaction with a specific data management solution. There was one question about overall satisfaction (the dependent variable in my regression), followed by various questions about more specific aspects such as data quality (the independent variables in my regression).
Using R, I fitted a multiple linear regression to figure out which of these aspects matter most for customer satisfaction. However, I believe my results are not entirely correct, because some of them don't make sense. For instance, according to the standardized coefficients, increasing data quality results in lower user satisfaction. From my point of view, the coefficients should be positive for all variables.
Maybe somebody here can help me or give me some tips on how to improve my model. Below you can find my code and the results (anonymized). The rows labeled M to AV are my independent variables. The columns to the right show the standardized coefficient, the standard error, the t value and the p-value.
# Reference: https://www.youtube.com/watch?v=EUbujtw5Azc

# Load libraries ----
library(lmtest)   # bptest(), coeftest()
library(car)      # vif()
library(sandwich) # vcovHC()

# Read data ----
# Semicolon-separated CSV with a header row; the file is chosen interactively.
daten <- read.csv(file.choose(), header = TRUE, sep = ";")

# Transform column K (it is detected as chr, but is numeric).
# Entries that cannot be parsed as numbers become NA (with a warning).
daten <- transform(daten, K = as.numeric(K))
str(daten)

# Regression model ----
# Alternative model with H as the response (disabled):
# modell <- lm(H ~ M + N + O + P + X + Y + Z + AA + AB + AE + AF + AG + AJ +
#   AL + AM + AN + AQ + AR + AS + AU + AV, daten)
modell <- lm(
  C ~ M + N + O + P + X + Y + Z + AA + AB + AE + AF + AG + AJ + AL + AM +
    AN + AQ + AR + AS + AU + AV,
  data = daten
)

# Assumptions ----
# 1 Normality of the residuals
# The plotted points should lie roughly on the line (corresponds to a normal
# distribution). Deviations at the beginning and the end are OK.
plot(modell, 2)

# 2a Homoscedasticity (residuals scatter equally)
plot(modell, 1) # should lie roughly on the ideal line
# Breusch-Pagan test, null hypothesis: homoscedasticity holds.
# If the p-value is > 0.05 the null hypothesis is retained.
bptest(modell)

# 3 No multicollinearity (independent variables correlate too strongly)
# The VIF should definitely be below 10, more conservatively below 6.
vif(modell)

# 4 Outliers / influential cases
# https://bjoernwalther.com/cook-distanz-in-r-ermitteln-und-interpretieren-ausreisser-erkennen/
plot(modell, 4)

# Robust standard errors ----
coeftest(modell, vcov = vcovHC(modell, type = "HC3"))

# Evaluation ----
summary(modell)
# The F statistic has the null hypothesis that the model makes no explanatory
# contribution --> here < .05, so it is rejected!
# R2 value --> approx. 60% of the variable is explained by the variables
# (actually 40%, see adjusted R2).

# Standardized coefficients to find the most influential variable ----
# Scale the rows that `modell` was actually fitted on (its model frame) rather
# than calling scale() inside the formula: scale() inside the formula uses the
# full columns before lm() drops incomplete rows, so with missing values the
# estimation sample would not have mean 0 / sd 1. Reusing formula(modell) also
# keeps both models on the same predictor set. Coefficients are now labelled
# M, N, ... instead of scale(M), scale(N), ...
zdaten <- as.data.frame(scale(model.frame(modell)))
zmodell <- lm(formula(modell), data = zdaten)
summary(zmodell)
# Dump the first 20 rows of `j` as an R literal; the structure(...) expression
# that follows appears to be the pasted output of this call.
# NOTE(review): `j` is not defined in the script above — presumably the same
# data as `daten`; confirm.
dput(head(j, 20))
structure(list(A = c(6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L), B = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), C = c(10L, 5L, 9L, 9L, 7L, 10L, 10L, 5L, 10L, 8L,
1L, 8L, 10L, 7L, 8L, 10L, 8L, 2L, 8L, 3L), D = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), E = c(5L, 3L, 4L, 5L, 4L, 4L, 6L, 3L, 5L, 3L, 4L, 2L, 4L,
2L, 3L, 5L, 3L, 4L, 3L, 2L), F = c(5L, 2L, 6L, 5L, 4L, 2L, 6L,
4L, 5L, 6L, 4L, 4L, 6L, 5L, 5L, 6L, 4L, 3L, 5L, 5L), G = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), H = c(6L, 3L, 5L, 4L, 5L, 4L, 5L, 4L, 5L, 4L, 2L,
5L, 5L, 4L, 4L, 6L, 4L, 5L, 4L, 1L), I = c(6L, 2L, 5L, 4L, 4L,
4L, 5L, 3L, 5L, 4L, 2L, 5L, 5L, 3L, 4L, 5L, 3L, 2L, 4L, 1L),
J = c(3L, 6L, 6L, 5L, 6L, 2L, 5L, 4L, 6L, 6L, 5L, 2L, 5L,
5L, 2L, 6L, 5L, 5L, 6L, 6L), K = c(5, 3.67, 5.33, 4.33, 5,
3.33, 5, 3.67, 5.33, 4.67, 3, 4, 5, 4, 3.33, 5.67, 4, 4,
4.67, 2.67), L = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), M = c(4L, 2L, 6L,
6L, 5L, 6L, 6L, 4L, 6L, 6L, 5L, 6L, 5L, 5L, 5L, 6L, 6L, 6L,
6L, 3L), N = c(6L, 5L, 5L, 5L, 6L, 6L, 6L, 5L, 6L, 6L, 4L,
4L, 4L, 3L, 5L, 5L, 4L, 5L, 5L, 2L), O = c(5L, 1L, 5L, 4L,
6L, 6L, 5L, 2L, 6L, 6L, 1L, 5L, 5L, 3L, 4L, 5L, 4L, 2L, 5L,
3L), P = c(6L, 1L, 4L, 4L, 4L, 6L, 6L, 2L, 5L, 3L, 2L, 5L,
5L, 3L, 5L, 5L, 4L, 5L, 2L, 1L), Q = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), R = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), S = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), T = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), U = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), V = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), W = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), X = c(4L, 1L, 3L, 4L, 5L, 6L, 5L, 3L, 5L, 4L, 1L, 5L,
4L, 1L, 4L, 1L, 5L, 2L, 4L, 1L), Y = c(5L, 1L, 3L, 3L, 3L,
6L, 5L, 2L, 6L, 4L, 1L, 3L, 4L, 1L, 5L, 5L, 3L, 2L, 3L, 2L
), Z = c(5L, 1L, 3L, 4L, 3L, 6L, 5L, 2L, 5L, 4L, 2L, 3L,
5L, 3L, 5L, 3L, 2L, 1L, 4L, 1L), AA = c(6L, 4L, 4L, 5L, 5L,
6L, 5L, 3L, 4L, 5L, 3L, 4L, 4L, 3L, 5L, 6L, 5L, 3L, 6L, 2L
), AB = c(6L, 6L, 4L, 4L, 3L, 6L, 5L, 3L, 5L, 3L, 2L, 6L,
5L, 6L, 5L, 5L, 5L, 5L, 6L, 2L), AC = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), AD = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), AE = c(5L, 1L, 6L, 4L, 6L,
5L, 4L, 3L, 5L, 5L, 2L, 2L, 4L, 1L, 5L, 3L, 3L, 4L, 4L, 1L
), AF = c(4L, 1L, 6L, 2L, 5L, 5L, 4L, 3L, 6L, 4L, 2L, 4L,
5L, 4L, 5L, 4L, 3L, 4L, 6L, 2L), AG = c(4L, 1L, 5L, 2L, 5L,
5L, 4L, 4L, 4L, 4L, 2L, 4L, 5L, 5L, 4L, 2L, 3L, 2L, 6L, 2L
), AH = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), AI = c(0L, 0L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L
), AJ = c(3L, 2L, 5L, 3L, 4L, 4L, 6L, 3L, 5L, 5L, 2L, 5L,
5L, 3L, 5L, 5L, 4L, 2L, 5L, 1L), AK = c(NA, NA, 5L, 3L, 4L,
4L, 5L, NA, 6L, 5L, NA, NA, 6L, NA, NA, NA, 4L, NA, NA, NA
), AL = c(4L, 4L, 6L, 4L, 6L, 5L, 5L, 3L, 6L, 5L, 4L, 6L,
5L, 3L, 5L, 4L, 5L, 3L, 6L, 1L), AM = c(5L, 1L, 6L, 4L, 5L,
2L, 4L, 2L, 6L, 4L, 2L, 2L, 6L, 1L, 5L, 3L, 2L, 1L, 4L, 3L
), AN = c(1L, 1L, 6L, 3L, 2L, 6L, 4L, 1L, 6L, 2L, 1L, 4L,
5L, 2L, 5L, 5L, 4L, 4L, 5L, 1L), AO = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), AP = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), AQ = c(3L, 1L, 6L, 3L, 6L,
1L, 5L, 2L, 6L, 5L, 6L, 3L, 6L, 1L, 5L, 3L, 2L, 2L, 4L, 2L
), AR = c(1L, 4L, 4L, 3L, 6L, 1L, 5L, 1L, 6L, 5L, 5L, 4L,
6L, 2L, 5L, 4L, 2L, 2L, 4L, 2L), AS = c(1L, 1L, 6L, 4L, 6L,
1L, 5L, 3L, 6L, 5L, 6L, 5L, 6L, 5L, 5L, 5L, 4L, 2L, 5L, 2L
), AT = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), AU = c(5L, 3L, 4L, 4L, 6L,
3L, 5L, 3L, 6L, 5L, 4L, 4L, 4L, 6L, 5L, 6L, 5L, 6L, 5L, 2L
), AV = c(6L, 3L, 5L, 4L, 6L, 2L, 6L, 2L, 6L, 4L, 4L, 4L,
4L, 6L, 4L, 6L, 3L, 6L, 2L, 3L), AW = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), AX = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), AY = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), AZ = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), BA = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), BB = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), BC = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), BD = c(5.25, 2.25, 5, 4.75, 5.25, 6, 5.75, 3.25, 5.75,
5.25, 3, 5, 4.75, 3.5, 4.75, 5.25, 4.5, 4.5, 4.5, 2.25),
BE = c(5.2, 2.6, 3.4, 4, 3.8, 6, 5, 2.6, 5, 4, 1.8, 4.2,
4.4, 2.8, 4.8, 4, 4, 2.6, 4.6, 1.6), BF = c(4.333333333,
1, 5.666666667, 2.666666667, 5.333333333, 5, 4, 3.333333333,
5, 4.333333333, 2, 3.333333333, 4.666666667, 3.333333333,
4.666666667, 3, 3, 3.333333333, 5.333333333, 1.666666667),
BG = c(3.25, 2, 5.75, 3.5, 4.25, 4.25, 4.75, 2.25, 5.75,
4, 2.25, 4.25, 5.25, 2.25, 5, 4.25, 3.75, 2.5, 5, 1.5), BH = c(1.666666667,
2, 5.333333333, 3.333333333, 6, 1, 5, 2, 6, 5, 5.666666667,
4, 6, 2.666666667, 5, 4, 2.666666667, 2, 4.333333333, 2),
BI = c(5.5, 3, 4.5, 4, 6, 2.5, 5.5, 2.5, 6, 4.5, 4, 4, 4,
6, 4.5, 6, 4, 6, 3.5, 2.5)), row.names = c(NA, 20L), class = "data.frame")