Rを通じて統計学を学ぶ備忘録ブログ

SPSSからRに移行したい私のような人向けのR解説ブログ兼学習用備忘録。

ちょっとした備忘録のページ。

x <- c(4, 1, 2, 3, 1, 2, 4, 2, 2, 1, 3, 1, 3, 2)
Eng <- c("A", "B", "C", "D")
y <- rep(NA, length(x))
for(i in 1:4){
  y[x==i] <- Eng[i]
}
data.frame(x = x, y = y)
set.seed(71)
dat <- matrix(floor(runif(30, min = 0,max = 10)), ncol = 3)
dat <- as.data.frame(dat)
colnames(dat) <- c("a", "b", "c")
dat

> dat
   a b c
1  3 1 6
2  5 7 2
3  3 1 7
4  2 7 2
5  3 8 1
6  9 7 4
7  6 1 4
8  8 8 3
9  3 8 2
10 4 9 8


#a列で重複している行を削除
dat[!duplicated(dat$a), ]

   a b c
1  3 1 6
2  5 7 2
4  2 7 2
6  9 7 4
7  6 1 4
8  8 8 3
10 4 9 8

#b,c列の組み合わせで重複している行を削除
dat[!duplicated(paste(dat$b, dat$c, sep = ",")), ]

   a b c
1  3 1 6
2  5 7 2
3  3 1 7
5  3 8 1
6  9 7 4
7  6 1 4
8  8 8 3
9  3 8 2
10 4 9 8
options(scipen = 5)

##想定クリック数1000回、クリック率1%の時の二項分布。
d <- as.data.frame(dbinom(x = 0:30, size = 1000, prob = 0.01))
d$x <- 0:30
colnames(d) <- c("y","x")
ggplot(data = d, aes(x = factor(x), y = y))+ geom_bar(stat = "identity")

##平均λ=1.43のポアソン分布。7時間で10cvの場合。
d <- as.data.frame(dpois(x = 0:10, lambda = 1.428))
d$x <- 0:10
colnames(d) <- c("y","x")
ggplot(data = d, aes(x = factor(x), y = y))+ geom_bar(stat = "identity")

参照元http://www.mm-lab.jp/analysis/data-analysis-start-with-excel-5th/

【オンライン広告の認知効果】

x <- 1:1000
y <- 5.980469*x^0.265342 + rnorm(x, 0, 5)
d <- data.frame(x,y)
plot(x,y)
fit <- lm(log(y) ~ log(x))
summary(fit)

result.con <- data.frame(x = d$x, conf = exp(predict(fit, interval="confidence", level = 0.90)))

ggplot(d, aes(x = x, y = y)) + geom_point() +
  geom_ribbon(data = result.con, aes(ymin=result.con$conf.lwr, ymax = result.con$conf.upr), fill = "darkgreen", alpha=0.2) +
  geom_line(data = result.con, aes(x = x, y = conf.fit)) 

参照元http://www.videor.co.jp/press/2015/150903.pdf

【重回帰分析の信頼区間作図】

m1071 <- lm(ACHIEVE ~ STUDY + FACULTY + STUDY:FACULTY, data=df1)

mean(df1$FACULTY)
dummy0 <- 0

min(df1$STUDY) #xの範囲を知るため
max(df1$STUDY) #xの範囲を知るため
xindex <- seq(4, 10, 1) #動かす変数の範囲


#動かす変数以外に平均値を放り込む
conf <- predict(m1071, 
                data.frame(STUDY = xindex, 
                           FACULTY = dummy0),
                interval="confidence") 

conf <- as.data.frame(conf)
conf$STUDY <- seq(4, 10, 1)

ggplot() + 
  geom_line(data = conf, aes(STUDY, fit)) +
  geom_ribbon(data = conf, aes(STUDY, ymin=lwr, ymax=upr), alpha=1/2) +
  geom_point(data = df1, aes(STUDY, ACHIEVE))

【重回帰分析の交互作用の±1SD】

m1071 <- lm(ACHIEVE ~ STUDY + INTEREST + STUDY:INTEREST, data=df1)

mean <- mean(df1$INTEREST)
mean_upp_1sd <- mean(df1$INTEREST) + sd(df1$INTEREST)
mean_low_1sd <- mean(df1$INTEREST) - sd(df1$INTEREST)

min(df1$STUDY) #xの範囲を知るため
max(df1$STUDY) #xの範囲を知るため
xindex <- seq(4, 10, 1) #動かす変数の範囲


#動かす変数以外に平均値を放り込む
conf <- predict(m1071, 
                data.frame(STUDY = xindex, 
                           INTEREST = mean),
                interval="confidence") 

conf <- as.data.frame(conf)
conf$STUDY <- seq(4, 10, 1)
##----------------------------
conf_upp <- predict(m1071, 
                  data.frame(STUDY = xindex, 
                             INTEREST = mean_upp_1sd),
                  interval="confidence") 
conf_upp <- as.data.frame(conf_upp)
conf_upp$STUDY <- seq(4, 10, 1)
##----------------------------
conf_low <- predict(m1071, 
                    data.frame(STUDY = xindex, 
                               INTEREST = mean_low_1sd),
                    interval="confidence") 
conf_low <- as.data.frame(conf_low)
conf_low$STUDY <- seq(4, 10, 1)
##----------------------------

ggplot() + 
  geom_line(data = conf, aes(STUDY, fit)) +
  geom_line(data = conf_upp, aes(STUDY, fit), colour="red",linetype="dashed") +
  geom_line(data = conf_low, aes(STUDY, fit), colour="blue",linetype="dashed") +
  geom_point(data = df1, aes(STUDY, ACHIEVE))

predcit関数

n <- 5
a <- 2
b <- 2

x <- rnorm(n, 10, 1)
e <- rnorm(n, 0, 1)
y <- a + b*x + e

plot(x,y)
model <- lm(y~x)
model

test <- data.frame(x = seq(min(x), max(x), 0.5))
A <- predict(object = model,
             newdata = newdata,
             se.fit = TRUE,
             interval = "confidence",
             levels = 0.95)

##predict関数による信頼区間
lines(test[,1], A$fit[,1])
lines(test[,1], A$fit[,2], col = "red", lty = "dotdash")
lines(test[,1], A$fit[,3], col = "red", lty = "dotdash")

##自作による信頼区間
test <- data.frame(x = seq(min(x), max(x), 0.5), predcit = A$fit)
test$self.lwr <- A$fit[,1] - A$se.fit * qt(0.975, A$df)
test$self.upr <- A$fit[,1] + A$se.fit * qt(0.975, A$df)
lines(test[,1], test[,5], col = "blue", lty = "dotted")
lines(test[,1], test[,6], col = "blue", lty = "dotted")

##予測区間
A$fit[,1] - sqrt(A$se.fit^2 + A$residual.scale^2)*qt(0.975, A$df)
A$fit[,1] + sqrt(A$se.fit^2 + A$residual.scale^2)*qt(0.975, A$df)
A <- predict(object = model,
             newdata = newdata,
             se.fit = TRUE,
             interval = "predicion",
             levels = 0.95)