################################################################ Introduction to R ############################################# Simple Calculation 2+3 2-3 2+3 2/3 sqrt(2) ## sqrt(x) computes the sqaure root of x abs(-2) 2==3 ## Double equal sign is a logical operator to test whether objects on both sides are equal ## The return result is FALSE or TRUE sqrt(2)*sqrt(2) == 2 ## Oops! R can't recognize thy are equal ## Additional Supporting Material: #1. Appendix G "Computational Precision and Floating Point Arithmetic", pages 753-771 #of Statistical Analysis and Data Display: An Intermediate Course with Examples in R, #Richard M. Heiberger and Burt Holland (Springer 2015, second edition). # http://link.springer.com/content/pdf/bbm%3A978-1-4939-2122-5%2F1.pdf. #2.David Goldberg (1991), "What Every Computer Scientist Should Know About Floating-Point Arithmetic", #ACM Computing Surveys, 23/1, 5-48. #http://www.validlab.com/goldberg/paper.pdf. all.equal(sqrt(2)*sqrt(2),2) ## In order to fix this porblem, we can use all.equal ############################################# Simple Manipulation ### Getting Help ?mean help(mean) help.start() ## Start the HTML version of R's online documentation help.search('me') ## searching the help system for documentation matching a given character string in the (file) ##name,title,concept or keyword entries. ### Quit R quit() ############################################## Objects in R ### How to give a value to a variable x<-123 y="happy" 1234->z w="happytwo" ## Double quote marks and single quote mark have the same function ### Structures of Obejcts ## Double and integer: #They are the two most common numeric classes used in R #R automatically converts between these two classes when needed for mathematical purposes. #As a result, it's feasible to use R and perform analyses for years without specifying these differences. x=c(1,2,3) typeof(x) ## Determines the type of one obejct x=c(1L,2L,3L) ## L suffix set the value 1 to be an integer typeof(x) ## Charatcer: y='happy' typeof(y) mode(y) ## list: w=list(22,'happy',TRUE) typeof(w) mode(w) ## Logical Value: zz=TRUE ww=T xx=True # Error----we can't use True to present logical value typeof(zz) typeof(ww) ### Change the Type of Obejcts: x=1.2345 typeof(x) as.integer(x) typeof(as.integer(x)) # The type changes from double to integer ########################## Sequences num=1:8 numa=seq(from = 1, to = 8, by = 1) numb=seq(1,8,1) numc=seq(8) ## sames as 1:8 num2=seq(1,20,length = 25) num3=rep(1:4, 3) num4=rep(c(3,5,7), 2) num5=rep(1:5, 5:1) ########################## List ## Organizing Data: List Class1=list(dept="STAT", number = "1234", title = "Introduction to R", enroll = 100,studentgroup= c(1,1,2,3,4,2,3,4,3,2,3,1)) mode(Class1) is.list(Class1) ## Specific a particular element in a list Class1[3] Class1[[3]] class(Class1[3])# single bracket returns a list element class(Class1[[3]]) # double brackets return a list of a single element Class1$title Class1$studentgroup Class1$studentgroup[4] ###################### Vector and Matrix ## Organizing Data: Vector---one-dimensional matrix Age=c(22,25,32,40,24,20,27,28) Sex=c("Male","Female","Female","Male","Female","Male","Male","Female") Name=c('Alex','Nancy','Susan','Jackson','Vivan','David','John','Mary') class(Age) Age[1] Age[Sex=='Male'] # Logical Indexing in Vector Sex=='Male' Age>20 Name[Age>20&Sex=='Male']## & mean and Name[Age<18| Age>38] ## | means or Name[Sex!='Male'] ## != means not equal #The %in% function goes through every value in the vector x, and returns TRUE #if it finds it in the vector of possible values - otherwise it returns FALSE. #x %in% vector of possible values Name[Age %in% c(22,28)] # Basic functions of Vector X=c(11,22,10,2333,221,666,34,2131,33) sort(X) sort(X,decreasing = TRUE) mean(X) var(X) max(X) min(X) median(X) summary(X) ## Organizing Data: Matrix matrix(1:12, nrow = 4, ncol = 3,byrow=T) matrix(1:12, nrow = 4, ncol = 3,byrow=FALSE) matrix(1:13, 4, 3) ## Check the warning message matrix(1:13, 4, 5) ## Check the warning message mat=matrix(1:12, nrow = 4, ncol = 3,byrow=T) dim(mat) mat2=matrix(1:12, nrow = 4, ncol = 3,byrow=FALSE) rbind(mat, mat2) cbind(mat, mat2) kids <- matrix(c(42, 66, 41, 63, 41, 53, 74, 51, 48, 68), ncol = 2, byrow = T) dimnames(kids) <- list(c("Tom", "John", "Jack", "Mary", "Susan"), c("Height", "Weight")) attributes(kids) ## Access an object's attributes m1=matrix(1:6, nrow=2, ncol=3) m2=matrix(7:12, nrow=3, ncol=2) m1%*%m2 ## inner product m1%o%m2 ## outer product v1=c(1, 3, -5) v2=c(4, -2, -1) v1%o%v2 diag(v1%o%v2) ######################## Factor directions=c("North", "East", "South", "West") directions.factor=factor(directions) directions.factor Blood_pres=seq(80,250,by=6) B_fac=cut(Blood_pres,breaks=c(79.9,120,200,250),labels=c("H","A","D")) ## (a,b] left open and right closed subset ## cut convert numeric to factor levels(B_fac)=c("Healthy","Attention","Dangeous") B_fac B_fac=ordered(B_fac,levels=c('Dangeous',"Attention","Healthy")) B_fac ## convert factor to character gender=c(rep("male",20), rep("female", 30)) gender_factor=as.factor(gender) levels(gender_factor)=c("F","M") gender_factor gender_re=as.character(gender_factor) ###################### Data Frames ## Organizing Data: Data Frames ## Compared with matrix, data frames are more generally and could contain different types of data. kiddata=data.frame(kids) attributes(kiddata) kiddata$Height kiddata$Weight kids$Height ## This does not work for matrix # To show the difference between data frame and matrix team=c("Bears", "Eagles", "Dolphins") wins=c(5,11,8) losses=c(9,3,6) football=data.frame(team, wins, losses) football_2=cbind(team,wins,losses) # Dataframes functions library(help='datasets') ?ChickWeight ## A built-in datasets for weight versus age of chicks on different diets head(ChickWeight) head(ChickWeight,n=4) tail(ChickWeight,n=5) View(ChickWeight) ## See the dataframe in a new window nrow(ChickWeight) ncol(ChickWeight) dim(ChickWeight) rownames(ChickWeight) colnames(ChickWeight) names(ChickWeight) summary(ChickWeight) ChickWeight$weight table(ChickWeight$Diet) ## It returns a table of the counts at each combination of factor levels. # Add a new column on one dataframe survey=data.frame("index" = c(1001, 1002, 1003, 1004, 1005), "age" = c(24, 25, 42, 56, 22)) survey$sex=c('m','m','f','f','m') survey # A new column named 'sex' comes out # Slice a dataframe survey[1,] survey[,'index'] survey[1,2] survey[survey$sex=='m',] # Slicing with logical values survey[survey$sex=='m'&survey$age<24,] subset(survey,subset = sex=='m',select=index) subset(survey,subset=sex=='m'&age<23) data("airquality") subset(airquality, Temp > 80, select = c(Ozone, Temp)) subset(airquality, Day == 1, select = -Temp) subset(airquality, select = Ozone:Wind) ########################################### Working Directory getwd() ## get the working directory setwd("C:/Users/Yiqing/Desktop") ## change the working directory getwd() ls() ##list jects in the working directory ### Save: Save one object "survey" into a new file. save(survey,file='Study1.RData') ## Save.image():If you have many objects that you want to save, #then using save can be tedious as you'd have to type the name of every object. #To save all the objects in your workspace as a .RData file, use the save.image() function. save.image(file='Study2.RData') load(file='Study1.RData') rm(survey) #### Furthermore, we can use write.table() to export data as a simple .txt text file. survey=data.frame("index" = c(1001, 1002, 1003, 1004, 1005), "age" = c(24, 25, 42, 56, 22),'sex'=c('m','m','f','f','m')) write.table(x=survey,file='survey.txt',sep='\t',row.names = FALSE) ##x---The object you want to write to a text file, usually a dataframe ##file=-----The document's file path relative to the working directory unless specified otherwise. #for example file = "mydata.txt" saves the text file directly in the working directory, #while file = "data/mydata.txt" will save the data in an existing folder called data inside the working directory. #You can also specify a full file path outside of your working directory #(file = "/Users/CaptainJack/Desktop/OctoberStudy/mydata.txt") ##sep-----A string indicating how the columns are separated. #For comma separated files, use sep = ",", for tab-delimited files, use sep = "\t" ##row.names--A logical value (TRUE or FALSE) indicating whether or not save the rownames in the text file. #(row.names = FALSE will not include row names) read.table("https://faculty.smu.edu/ngh/stat6304/dataset.txt") ## read.table to help us to read dataset into R read.table("https://faculty.smu.edu/ngh/stat6304/dataset.txt",header=T) read.table("https://faculty.smu.edu/ngh/stat6304/dataset.txt",header=T,na.strings = '.') ## header: A logical value indicating whether the data has a header row #that is, whether the first row of the data represents the column names. ##na.string:a character vector of strings which are to be interpreted as NA values. read.csv("Sales Data.csv",header = T) ######################################### Simple Functions # Area of a circle S=pi*r^2 area_circle=function(r){ pi*r^2 } area_circle(10) # If function Obesity=function(bmi){ if(bmi>24){ return('Attention: Obesity') } else{ return('You are in the healthy range') } } Obesity(25) Obesity(23) ## While function #Sum from 0 to N SumofN=function(N){ value=0 i=0 while(i<=N){ value=value+i i=i+1 } value } SumofN(50) ################################################Simple Plots x <- seq(-3,3, length = 1000) y <- exp(-x*x/2)/sqrt(2*pi) plot(x,y) dev.off() library(MASS) plot(Cars93$MPG.city, type = "p") ## p--points l--lines b--both n--no plotting plot(Cars93$MPG.city, type = "l") plot(1:10,rep(5, 10), type = "l") plot(1:10,rep(5, 10), type = "l", lwd = 2) # lwd---Line width plot(1:10,rep(5, 10), type = "l", lty = 2) # lty---line type plot(1:10,rep(5, 10), type = "l", col = 'red') plot(1:10,rep(5, 10), type = "l", col = '#3F8C41') # Get the color you want ! #http://www.rapidtables.com/convert/color/rgb-to-hex.htm colors() ?plot.default # Add titles plot(Cars93$MPG.city, Cars93$Weight, xlab = "", ylab = "") title(main = "Scatter plot of Cars Weight vs. City MPG", sub = "this is a subtitle", xlab = "City MPG", ylab = "Weight") # Add points heavy <- Cars93$Weight > 3500 plot(Cars93$MPG.city, Cars93$Weight, type = "n") points(Cars93$MPG.city[heavy], Cars93$Weight[heavy], pch = 16, col = 4) points(Cars93$MPG.city[!heavy], Cars93$Weight[!heavy], pch = 5, col = 6) # Add axis plot(Cars93$MPG.city, Cars93$Weight, axes=F) axis(1) axis(2) axis(3) axis(4) box() # change axis plot(Cars93$MPG.city, Cars93$Weight, axes=F) axis(1, at = c(20, 30, 40), labels = c("A", "B", "C")) axis(2, lty = 2, lwd = 3, col = "red", font = 2) ## font: 1=plain, 2=bold, 3=italic, 4=bold-italic #Add text plot(1:10, 1:10, type = "n") text(3,1,"This is font 1", font = 1, cex = 2) ## font: 1=plain, 2=bold, 3=italic, 4=bold-italic 5= Greek letters text(3,2,"This is font 2", font = 2, cex = 2) text(3,3,"This is font 3", font = 3, cex = 2) text(3,4,"This is font 4", font = 4, cex = 2) text(3,5,"This is font 5", font = 5, cex = 2) # Add lines plot(Cars93$MPG.city, Cars93$Weight) lines(c(20, 30), c(2000, 2000)) dev.off() # Box Plot boxplot(Cars93$Price) boxplot(Cars93$Weight~Cars93$DriveTrain, col = 2:4) # Bar plot counts <- table(Cars93$Cylinders) barplot(counts, col = 1:6, xlab = "Number of Cylinders", angle=seq(90,180,len=6), density=20) ##Pie Chart pie(counts, col = rainbow(length(counts))) ##Histogram hist(Cars93$Weight) ##Starplot data(longley) stars(longley[1:16,]) # Faceplot install.packages("aplpack") library(aplpack) faces(longley[1:16,],face.type=0) ## face.type=0---line drawing faces faces(longley[1:16,],face.type=1) ## face.type=1---the elements of the faces are painted #map install.packages('maps') library(maps) map() map("state") ## 3d scatterplot install.packages("scatterplot3d") library(scatterplot3d) scatterplot3d(Cars93$MPG.city, Cars93$Weight, Cars93$Horsepower, pch = 15) ################################# Simple Linear regreesion x=c(18,23,25,35,65,54,34,56,72,19,23,42,18,39,37) y=c(202,186,187,180,156,169,174,172,153,199,193,174,198,183,178) plot(x,y) # make a plot lm1=lm(y~x) summary(lm1) coef(lm1) resid(lm1) fitted(lm1) confint(lm1) predict(lm1,data.frame(x= c(50,60))) predict(lm1,data.frame(x=x), level=.9, interval="confidence") predict(lm.result,data.frame(x=sort(x)), level=.9, interval="prediction") plot(x,y) abline(lm1) ci.lwr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="confidence")[,2] ci.upr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="confidence")[,3] pi.lwr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="prediction")[,2] pi.upr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="prediction")[,3] lines(sort(x), ci.lwr,lty=2) lines(sort(x), ci.upr,lty=2) lines(sort(x), pi.lwr,lty=3) lines(sort(x), pi.upr,lty=3) plot(lm1) ################################ MLR colnames(Cars93) Data_mlr=cbind(Cars93[,c('Price','Width','Weight','Turn.circle','Horsepower')],rnorm(dim(Cars93)[1])) head(Data_mlr) pairs(Data_mlr) cor(Data_mlr) ############################two sample t-test ##A small jewelry shop has 2 kiosks in the mall; we will call them east and west side. ##The number of sales were recorded for a random selection of 50 hours that each store was open; #i.e. 3 means that during a one hour time period 3 sales were made. #The times span all hours the kiosk was open and were taken during intervals: #10am – 11am, 11am-noon, noon-1pm, etc. Sale=read.csv("Sales Data.csv",header = T) #Question: Do the two kiosks perform similarly? (E_bar=W_bar?) East=Sale[,1] West=Sale[,2] t.test(x=East,y=West,alternative='two.sided') ## Variance are not equal t.test(x=East,y=West,alternative='two.sided',var.equal = TRUE) ## variance are equal ########################### One sample t-test mean(East) ## Question: Does expected sales per hour in East side exceed 3.5? t.test(East,alternative='greater',mu=3.5) ########################## Paired two sample t-test #Assume the shop owner modified management system for East side and the second column is the sale after modficiation ## We want to explore whether this modification improves sales. ## Question: X_bar(1)=X_bar(2) t.test(x=East,y=West,alternative='two.sided',paired=TRUE) ## Variance are not equal ##########################Signed rank test --one sample --Wilcoxon signed-rank test (nonparametric) library(MASS) data(immer) ##The immer data frame has 30 rows and 4 columns. ##Five varieties of barley were grown in six locations in each of 1931 and 1932. head(immer) ## Question: Without assuming the data to have normal distribution, ##test at .05 significance level if the barley yields of 1931 and 1932 have identical data distributions. wilcox.test(immer$Y1, immer$Y2, paired=TRUE) ######################### Rank Sum test---2 samples women_weight=c(38.9, 61.2, 73.3, 21.8, 63.4, 64.6, 48.4, 48.8, 48.5) men_weight=c(67.8, 60, 63.4, 76, 89.4, 73.3, 67.3, 61.3, 62.4) # Create a data frame my_data <- data.frame( group = rep(c("Woman", "Man"), each = 9), weight = c(women_weight, men_weight) ) head(my_data) wilcox.test(women_weight, men_weight) res=wilcox.test(weight ~ group, data = my_data) res$statistic res$p.value ######################## one-way ANOVA data(PlantGrowth) #Results from an experiment to compare yields (as measured by dried weight of plants) #obtained under a control and two different treatment conditions. head(PlantGrowth) levels(PlantGrowth$group) ##We want to know if there is any significant difference #between the average weights of plants in the 3 experimental conditions. ANOVA=aov(weight ~ group, data = PlantGrowth) summary(ANOVA) #####################