An Introduction to R

Motivation

R is a statistical programming environment with many built-in mathematical functions and many others that are found in packages that can be installed. RRPP is one such package.

Analyses in R are performed using a series of commands which are written in script files and passed to the main console. The general workflow is:

1: OPEN R
2: OPEN R-script
3: change to working directory
4: run analyses

1: Some basic commands

Like any programming language, one must learn its syntax. Over time one learns the commands in R, and how to string them together into meaningful operations. Here are some basic commands:

a <- 3  #Assign value to variable (the arrow points toward the name being assigned)
a

## [1] 3

b<-c(3,4,5)   # 'c' combines values into vector or list
b

## [1] 3 4 5

b[2]    #access items in vectors by calling their position

## [1] 4

a <- rnorm(50) #generate random normal vector
b <- rnorm(length(a)) #second random normal vector, same length as a
plot(a, b)      # a simple plot

c<-cbind(a,b)    #binds columns together (rbind does same by rows)
c

##                 a            b
##  [1,] -1.59059789  0.131686082
##  [2,]  1.63767509  0.697247333
##  [3,] -0.46031947 -1.530144463
##  [4,] -0.27957645 -1.061497956
##  [5,]  1.07438978 -0.001245916
##  [6,] -1.43820044  0.038591686
##  [7,]  0.65750517 -1.021081678
##  [8,]  0.61814024 -0.619579484
##  [9,]  1.58640202 -0.017613066
## [10,] -0.52055923  1.481115730
## [11,] -0.82545131  1.856718494
## [12,] -1.81085213 -0.016228555
## [13,] -0.25295858 -1.527810672
## [14,]  0.78769691  0.486017620
## [15,] -0.42293857  0.346873502
## [16,] -0.61606921 -0.253465134
## [17,] -0.78744488  0.479667075
## [18,]  0.78492840 -0.199368869
## [19,] -0.63787926 -1.515632154
## [20,] -0.24755767  1.039194128
## [21,]  1.35811638  0.800355902
## [22,]  0.62690586 -0.370925759
## [23,]  1.12208067  1.525413714
## [24,] -0.08769722 -0.241977288
## [25,] -0.64960246  0.858010494
## [26,] -2.26377761 -0.410770022
## [27,]  0.88648460  1.860177560
## [28,] -1.20893336  1.027397098
## [29,]  0.11422066  0.121873051
## [30,] -1.02622540 -0.698113415
## [31,]  0.74655256 -0.104293570
## [32,]  0.79261523  0.376735663
## [33,]  1.03624273  0.244737180
## [34,] -0.96000164  0.071130810
## [35,]  0.98725298  1.257791969
## [36,] -2.31406591  0.835467418
## [37,]  0.34722426  0.678026967
## [38,] -0.90743941  0.654504306
## [39,] -1.41237401  1.851547614
## [40,] -0.29614991  0.053869393
## [41,]  1.13145140 -1.066963133
## [42,] -1.35349650 -0.671970251
## [43,] -1.25094480  0.059124091
## [44,] -1.26637452 -0.939270824
## [45,] -0.35000401 -1.023622961
## [46,]  0.46002816  0.812502534
## [47,] -0.23381900  3.158123282
## [48,] -1.27727089  0.233459432
## [49,]  0.07240996 -1.035101133
## [50,]  0.68826553  1.538580092

c[1]   #first element

## [1] -1.590598

c[1,]    #first row

##          a          b 
## -1.5905979  0.1316861

c[,1]  #first column

##  [1] -1.59059789  1.63767509 -0.46031947 -0.27957645  1.07438978 -1.43820044
##  [7]  0.65750517  0.61814024  1.58640202 -0.52055923 -0.82545131 -1.81085213
## [13] -0.25295858  0.78769691 -0.42293857 -0.61606921 -0.78744488  0.78492840
## [19] -0.63787926 -0.24755767  1.35811638  0.62690586  1.12208067 -0.08769722
## [25] -0.64960246 -2.26377761  0.88648460 -1.20893336  0.11422066 -1.02622540
## [31]  0.74655256  0.79261523  1.03624273 -0.96000164  0.98725298 -2.31406591
## [37]  0.34722426 -0.90743941 -1.41237401 -0.29614991  1.13145140 -1.35349650
## [43] -1.25094480 -1.26637452 -0.35000401  0.46002816 -0.23381900 -1.27727089
## [49]  0.07240996  0.68826553

rbind(a,b)

##         [,1]      [,2]       [,3]       [,4]         [,5]        [,6]
## a -1.5905979 1.6376751 -0.4603195 -0.2795765  1.074389784 -1.43820044
## b  0.1316861 0.6972473 -1.5301445 -1.0614980 -0.001245916  0.03859169
##         [,7]       [,8]        [,9]      [,10]      [,11]       [,12]
## a  0.6575052  0.6181402  1.58640202 -0.5205592 -0.8254513 -1.81085213
## b -1.0210817 -0.6195795 -0.01761307  1.4811157  1.8567185 -0.01622856
##        [,13]     [,14]      [,15]      [,16]      [,17]      [,18]      [,19]
## a -0.2529586 0.7876969 -0.4229386 -0.6160692 -0.7874449  0.7849284 -0.6378793
## b -1.5278107 0.4860176  0.3468735 -0.2534651  0.4796671 -0.1993689 -1.5156322
##        [,20]     [,21]      [,22]    [,23]       [,24]      [,25]     [,26]
## a -0.2475577 1.3581164  0.6269059 1.122081 -0.08769722 -0.6496025 -2.263778
## b  1.0391941 0.8003559 -0.3709258 1.525414 -0.24197729  0.8580105 -0.410770
##       [,27]     [,28]     [,29]      [,30]      [,31]     [,32]     [,33]
## a 0.8864846 -1.208933 0.1142207 -1.0262254  0.7465526 0.7926152 1.0362427
## b 1.8601776  1.027397 0.1218731 -0.6981134 -0.1042936 0.3767357 0.2447372
##         [,34]    [,35]      [,36]     [,37]      [,38]     [,39]       [,40]
## a -0.96000164 0.987253 -2.3140659 0.3472243 -0.9074394 -1.412374 -0.29614991
## b  0.07113081 1.257792  0.8354674 0.6780270  0.6545043  1.851548  0.05386939
##       [,41]      [,42]       [,43]      [,44]     [,45]     [,46]     [,47]
## a  1.131451 -1.3534965 -1.25094480 -1.2663745 -0.350004 0.4600282 -0.233819
## b -1.066963 -0.6719703  0.05912409 -0.9392708 -1.023623 0.8125025  3.158123
##        [,48]       [,49]     [,50]
## a -1.2772709  0.07240996 0.6882655
## b  0.2334594 -1.03510113 1.5385801

ls() # See which R objects are now in the R workspace.

## [1] "a" "b" "c"

### Some base functions
sum(a)

## [1] -9.231993

mean(a)

## [1] -0.1846399

min(a)

## [1] -2.314066

max(a)

## [1] 1.637675

var(a)

## [1] 1.045132

a^2     #square values

##  [1] 2.530001645 2.681979700 0.211894012 0.078162992 1.154313408 2.068420510
##  [7] 0.432313044 0.382097360 2.516671380 0.270981910 0.681369865 3.279185428
## [13] 0.063988044 0.620466416 0.178877033 0.379541270 0.620069436 0.616112596
## [19] 0.406889954 0.061284799 1.844480094 0.393010963 1.259065026 0.007690803
## [25] 0.421983360 5.124689076 0.785854940 1.461519859 0.013046359 1.053138577
## [31] 0.557340723 0.628238909 1.073799004 0.921603145 0.974668452 5.354901030
## [37] 0.120564687 0.823446290 1.994800343 0.087704770 1.280182273 1.831952784
## [43] 1.564862886 1.603704437 0.122502804 0.211625908 0.054671324 1.631420917
## [49] 0.005243202 0.473709445

sqrt(a) #NaN for negative values

## Warning in sqrt(a): NaNs produced

##  [1]       NaN 1.2797168       NaN       NaN 1.0365278       NaN 0.8108669
##  [8] 0.7862190 1.2595245       NaN       NaN       NaN       NaN 0.8875229
## [15]       NaN       NaN       NaN 0.8859619       NaN       NaN 1.1653825
## [22] 0.7917739 1.0592831       NaN       NaN       NaN 0.9415331       NaN
## [29] 0.3379655       NaN 0.8640327 0.8902894 1.0179601       NaN 0.9936061
## [36]       NaN 0.5892574       NaN       NaN       NaN 1.0636970       NaN
## [43]       NaN       NaN       NaN 0.6782538       NaN       NaN 0.2690910
## [50] 0.8296177

abs(a)

##  [1] 1.59059789 1.63767509 0.46031947 0.27957645 1.07438978 1.43820044
##  [7] 0.65750517 0.61814024 1.58640202 0.52055923 0.82545131 1.81085213
## [13] 0.25295858 0.78769691 0.42293857 0.61606921 0.78744488 0.78492840
## [19] 0.63787926 0.24755767 1.35811638 0.62690586 1.12208067 0.08769722
## [25] 0.64960246 2.26377761 0.88648460 1.20893336 0.11422066 1.02622540
## [31] 0.74655256 0.79261523 1.03624273 0.96000164 0.98725298 2.31406591
## [37] 0.34722426 0.90743941 1.41237401 0.29614991 1.13145140 1.35349650
## [43] 1.25094480 1.26637452 0.35000401 0.46002816 0.23381900 1.27727089
## [49] 0.07240996 0.68826553

cor(a,b)

## [1] 0.07365987

rm(list=ls())   #remove items in memory

# Matrix operations
a<-matrix(c(1,0,4,2,-1,1),nrow=3)   # 3 x 2 matrix; values fill column by column
b<-matrix(c(1,-1,2,1,1,0),nrow=2)   # 2 x 3 matrix
a
b

##      [,1] [,2]
## [1,]    1    2
## [2,]    0   -1
## [3,]    4    1

##      [,1] [,2] [,3]
## [1,]    1    2    1
## [2,]   -1    1    0

c<-t(a) #matrix transpose
a       #the original matrix ...
c       #... and its transpose (rows and columns swapped)

##      [,1] [,2]
## [1,]    1    2
## [2,]    0   -1
## [3,]    4    1

##      [,1] [,2] [,3]
## [1,]    1    0    4
## [2,]    2   -1    1

2*a #scalar multiplication

##      [,1] [,2]
## [1,]    2    4
## [2,]    0   -2
## [3,]    8    2

#Matrix addition and subtraction
b+c

##      [,1] [,2] [,3]
## [1,]    2    2    5
## [2,]    1    0    1

b-c

##      [,1] [,2] [,3]
## [1,]    0    2   -3
## [2,]   -3    2   -1

a+b  ##NOTE: non-conformable matrices (check rxc of your matrices!)

## Error in a + b: non-conformable arrays

#elementwise multiplication (hadamard product)
#print both operands first; they must have identical dimensions
c
b

##      [,1] [,2] [,3]
## [1,]    1    0    4
## [2,]    2   -1    1

##      [,1] [,2] [,3]
## [1,]    1    2    1
## [2,]   -1    1    0

c*b

##      [,1] [,2] [,3]
## [1,]    1    0    4
## [2,]   -2   -1    0

# matrix multiplication
a%*%b       ## %*% is symbol for matrix multiplication

##      [,1] [,2] [,3]
## [1,]   -1    4    1
## [2,]    1   -1    0
## [3,]    3    9    4

b%*%a       ## matrix order matters

##      [,1] [,2]
## [1,]    5    1
## [2,]   -1   -3

rm(list=ls())


gl(2,10)   # generate a factor: 2 levels, each repeated 10 times

##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## Levels: 1 2

2: Reading Data

There are many ways to read data into R. Here is one example:

# header=TRUE: the first row of the file holds the column names.
# Spell out TRUE rather than T, because T is an ordinary variable that can be reassigned.
mydata<-read.csv(file="Data/Lab-01-RIntroData.csv",header=TRUE)
mydata

##     x         y        y2 groups
## 1   1  1.521048  0.585666      a
## 2   2  2.686718  2.306728      a
## 3   3  4.708974  1.438889      a
## 4   4  5.024545  4.085192      a
## 5   5  6.010978  4.902465      a
## 6   6  7.998740  6.169760      a
## 7   7  8.352256  5.914856      a
## 8   8  9.333044  5.300719      a
## 9   9  9.287240  3.658625      a
## 10 10 12.259109 10.230400      a
## 11 11 13.079777  8.241945      b
## 12 12 13.156969  9.868003      b
## 13 13 13.833857  4.100591      b
## 14 14 14.484650  6.877223      b
## 15 15 15.199484 13.079567      b
## 16 16 18.993665 15.629931      b
## 17 17 18.989006 13.538135      b
## 18 18 19.231329  5.108227      b
## 19 19 21.568282  6.446203      b
## 20 20 22.978398 19.427795      b

Y<-as.matrix(mydata[,(2:3)])     # columns 2-3 (y and y2) as a matrix
FactorA<-as.factor(mydata[,4])   # column 4 (groups) as a factor
Y

##               y        y2
##  [1,]  1.521048  0.585666
##  [2,]  2.686718  2.306728
##  [3,]  4.708974  1.438889
##  [4,]  5.024545  4.085192
##  [5,]  6.010978  4.902465
##  [6,]  7.998740  6.169760
##  [7,]  8.352256  5.914856
##  [8,]  9.333044  5.300719
##  [9,]  9.287240  3.658625
## [10,] 12.259109 10.230400
## [11,] 13.079777  8.241945
## [12,] 13.156969  9.868003
## [13,] 13.833857  4.100591
## [14,] 14.484650  6.877223
## [15,] 15.199484 13.079567
## [16,] 18.993665 15.629931
## [17,] 18.989006 13.538135
## [18,] 19.231329  5.108227
## [19,] 21.568282  6.446203
## [20,] 22.978398 19.427795

FactorA

##  [1] a a a a a a a a a a b b b b b b b b b b
## Levels: a b

3: Other Functions: the apply family

A useful set of functions is the ‘apply’ family. These functions perform some task repeatedly over the parts of a data structure (for example, over the rows or columns of a matrix, or over groups of observations). A few simple examples follow:

apply(Y,2,sd)    #here, we obtain the std for each column of a matrix

##        y       y2 
## 6.334554 4.929332

tapply(Y[,1],FactorA,mean)  #Obtain means for first column for levels of FactorA

##         a         b 
##  6.718265 17.151542

tapply(Y[,2],FactorA,mean)  #Obtain means for second column for levels of FactorA

##        a        b 
##  4.45933 10.23176

tapply(Y,FactorA,mean)        #Try entire matrix: doesn't work

## Error in tapply(Y, FactorA, mean): arguments must have same length

# Group means for every column at once: per-group column sums divided by the group sizes
rowsum(Y, FactorA)/as.vector(table(FactorA))

##           y       y2
## a  6.718265  4.45933
## b 17.151542 10.23176

4: Building Functions

One can also write one's own functions. These are defined as function(xx), where xx names the input arguments (there may be more than one). Below is a simple example (NOTE: a good way to learn the syntax of functions is to type the name of a function at the command line without (). This will then show the underlying code of the function):

# mymean: arithmetic mean of a numeric vector, computed with an explicit loop
# (a teaching example of writing a function; the built-in mean() does the same job).
#   x       numeric vector
#   returns the sum of the elements of x divided by length(x);
#           NaN (0/0) when x is empty, matching mean()
mymean<-function(x){
  n<-length(x)
  tmp<-0
  # seq_len(n) is empty when n is 0, whereas 1:n would give c(1, 0)
  # and the loop would wrongly index x[1] and x[0]
  for (i in seq_len(n)){
    tmp<-tmp+x[i]
  }
  tmp/n   # the value of the last expression is what the function returns
}

x<-rnorm(10)
mean(x)

## [1] 0.6692329

mymean(x)  #works!

## [1] 0.6692329

5: Basic Statistical Models

Because R is a statistical programming language there are many statistical models in R. Here are a few basic ones (we’ll discover many more this semester):

model1<-lm(mydata$y~mydata$x)  #run regression
summary(model1)

## 
## Call:
## lm(formula = mydata$y ~ mydata$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5180 -0.4477  0.0265  0.6464  1.2133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.77548    0.36734   2.111    0.049 *  
## mydata$x     1.06280    0.03066  34.659   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7908 on 18 degrees of freedom
## Multiple R-squared:  0.9852, Adjusted R-squared:  0.9844 
## F-statistic:  1201 on 1 and 18 DF,  p-value: < 2.2e-16

anova(model1)  #generates anova table of results

## Analysis of Variance Table
## 
## Response: mydata$y
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## mydata$x   1 751.15  751.15  1201.2 < 2.2e-16 ***
## Residuals 18  11.26    0.63                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#A plot with regression line
plot(mydata$x,mydata$y)
abline(coef(model1))

model2<-lm(mydata$y~mydata$groups)  #run anova
summary(model2)

## 
## Call:
## lm(formula = mydata$y ~ mydata$groups)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1972 -2.8296  0.2866  2.2021  5.8269 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.718      1.101   6.103 9.14e-06 ***
## mydata$groupsb   10.433      1.557   6.702 2.77e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.481 on 18 degrees of freedom
## Multiple R-squared:  0.7139, Adjusted R-squared:  0.698 
## F-statistic: 44.91 on 1 and 18 DF,  p-value: 2.767e-06

anova(model2)

## Analysis of Variance Table
## 
## Response: mydata$y
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## mydata$groups  1 544.27  544.27  44.911 2.767e-06 ***
## Residuals     18 218.14   12.12                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1