################################################################ Introduction to R 

############################################# Simple Calculation 
2+3
2-3
2+3
2/3
sqrt(2) ## sqrt(x) computes the sqaure root of x 
abs(-2)

2==3  ## Double equal sign is a logical operator to test whether objects on both sides are equal 
      ## The return result is FALSE or TRUE

sqrt(2)*sqrt(2) == 2  ## Oops! R can't recognize thy are equal 

## Additional Supporting Material:  

#1. Appendix G "Computational Precision and Floating Point Arithmetic", pages 753-771
#of Statistical Analysis and Data Display: An Intermediate Course with Examples in R, 
#Richard M. Heiberger and Burt Holland (Springer 2015, second edition). 
# http://link.springer.com/content/pdf/bbm%3A978-1-4939-2122-5%2F1.pdf.

#2.David Goldberg (1991), "What Every Computer Scientist Should Know About Floating-Point Arithmetic", 
#ACM Computing Surveys, 23/1, 5-48. 
#http://www.validlab.com/goldberg/paper.pdf.

all.equal(sqrt(2)*sqrt(2),2) ## In order to fix this porblem, we can use all.equal 

############################################# Simple Manipulation 

### Getting Help 

?mean
help(mean)
help.start() ## Start the HTML version of R's online documentation 
help.search('me') ## searching the help system for documentation matching a given character string in the (file) 
                  ##name,title,concept or keyword entries. 


### Quit R 
quit()

##############################################  Objects in R 

### How to give a value to a variable 
x<-123
y="happy"
1234->z 
w="happytwo"  ## Double quote marks and single quote mark have the same function 

### Structures of Obejcts 

## Double and integer: 
#They are the two most common numeric classes used in R 
#R automatically converts between these two classes when needed for mathematical purposes. 
#As a result, it's feasible to use R and perform analyses for years without specifying these differences.
x=c(1,2,3)
typeof(x) ## Determines the type of one obejct 

x=c(1L,2L,3L) ## L suffix set the value 1 to be an integer 
typeof(x) 

## Charatcer:
y='happy'
typeof(y)
mode(y)

## list:
w=list(22,'happy',TRUE)
typeof(w)
mode(w)

## Logical Value:
zz=TRUE
ww=T
xx=True  # Error----we can't use True to present logical value 
typeof(zz)
typeof(ww)

### Change the Type of Obejcts:

x=1.2345
typeof(x)
as.integer(x)
typeof(as.integer(x))  # The type changes from double to integer 

########################## Sequences 
num=1:8
numa=seq(from = 1, to = 8, by = 1)
numb=seq(1,8,1)
numc=seq(8) ## sames as 1:8

num2=seq(1,20,length = 25) 

num3=rep(1:4, 3) 
num4=rep(c(3,5,7), 2)
num5=rep(1:5, 5:1) 

########################## List 


## Organizing Data: List 

Class1=list(dept="STAT", number = "1234", title = "Introduction to R", enroll = 100,studentgroup= c(1,1,2,3,4,2,3,4,3,2,3,1))

mode(Class1)
is.list(Class1) 

## Specific a particular element in a list 

Class1[3]
Class1[[3]]
class(Class1[3])# single bracket returns a list element 
class(Class1[[3]]) # double brackets return a list of a single element 

Class1$title 

Class1$studentgroup
Class1$studentgroup[4] 

###################### Vector and Matrix 

## Organizing Data: Vector---one-dimensional matrix 

Age=c(22,25,32,40,24,20,27,28)
Sex=c("Male","Female","Female","Male","Female","Male","Male","Female")
Name=c('Alex','Nancy','Susan','Jackson','Vivan','David','John','Mary')
class(Age)
Age[1]
Age[Sex=='Male']

# Logical Indexing in Vector

Sex=='Male'
Age>20 
Name[Age>20&Sex=='Male']## & mean and 
Name[Age<18| Age>38]  ## | means or
Name[Sex!='Male'] ## != means not equal 

#The %in% function goes through every value in the vector x, and returns TRUE
#if it finds it in the vector of possible values - otherwise it returns FALSE.
#x %in% vector of possible values 

Name[Age %in% c(22,28)]


# Basic functions of Vector 
X=c(11,22,10,2333,221,666,34,2131,33)
sort(X)
sort(X,decreasing = TRUE)

mean(X)
var(X)
max(X)
min(X)
median(X)
summary(X)



## Organizing Data: Matrix

matrix(1:12, nrow = 4, ncol = 3,byrow=T) 
matrix(1:12, nrow = 4, ncol = 3,byrow=FALSE) 


matrix(1:13, 4, 3) ## Check the warning message 
matrix(1:13, 4, 5) ## Check the warning message  

mat=matrix(1:12, nrow = 4, ncol = 3,byrow=T) 
dim(mat) 
mat2=matrix(1:12, nrow = 4, ncol = 3,byrow=FALSE) 
rbind(mat, mat2)
cbind(mat, mat2)

kids <- matrix(c(42, 66, 41, 63, 41, 53, 74, 51, 48, 68), ncol = 2, byrow = T)
dimnames(kids) <- list(c("Tom", "John", "Jack", "Mary", "Susan"), c("Height", "Weight"))

attributes(kids) ## Access an object's attributes 


m1=matrix(1:6, nrow=2, ncol=3)
m2=matrix(7:12, nrow=3, ncol=2)
m1%*%m2 ## inner product
m1%o%m2 ## outer product

v1=c(1, 3, -5)
v2=c(4, -2, -1)

v1%o%v2
diag(v1%o%v2)



########################  Factor 


directions=c("North", "East", "South", "West")
directions.factor=factor(directions)
directions.factor

Blood_pres=seq(80,250,by=6)

B_fac=cut(Blood_pres,breaks=c(79.9,120,200,250),labels=c("H","A","D"))  ## (a,b] left open and right closed subset 

## cut convert numeric to factor 

levels(B_fac)=c("Healthy","Attention","Dangeous")
B_fac

B_fac=ordered(B_fac,levels=c('Dangeous',"Attention","Healthy"))
B_fac

## convert factor to character 

gender=c(rep("male",20), rep("female", 30)) 
gender_factor=as.factor(gender)
levels(gender_factor)=c("F","M")
gender_factor


gender_re=as.character(gender_factor)

###################### Data Frames

## Organizing Data: Data Frames

## Compared with matrix, data frames are more generally and could contain different types of data. 



kiddata=data.frame(kids)

attributes(kiddata) 

kiddata$Height 
kiddata$Weight 
kids$Height  ## This does not work for matrix


# To show the difference between data frame and matrix 
team=c("Bears", "Eagles", "Dolphins")
wins=c(5,11,8)
losses=c(9,3,6)
football=data.frame(team, wins, losses)
football_2=cbind(team,wins,losses)

# Dataframes functions
library(help='datasets')

?ChickWeight ## A built-in datasets for weight versus age of chicks on different diets 
head(ChickWeight)
head(ChickWeight,n=4)
tail(ChickWeight,n=5)

View(ChickWeight) ## See the dataframe in a new window 

nrow(ChickWeight)
ncol(ChickWeight)
dim(ChickWeight)

rownames(ChickWeight)
colnames(ChickWeight)
names(ChickWeight)


summary(ChickWeight)

ChickWeight$weight

table(ChickWeight$Diet) ## It returns a table of the counts at each combination of factor levels. 


# Add a new column on one dataframe 
survey=data.frame("index" = c(1001, 1002, 1003, 1004, 1005),
                     "age" = c(24, 25, 42, 56, 22))
survey$sex=c('m','m','f','f','m')
survey # A new column named 'sex' comes out 

# Slice a dataframe 

survey[1,]
survey[,'index']
survey[1,2]

survey[survey$sex=='m',] # Slicing with logical values
survey[survey$sex=='m'&survey$age<24,]

subset(survey,subset = sex=='m',select=index)
subset(survey,subset=sex=='m'&age<23)

data("airquality")

subset(airquality, Temp > 80, select = c(Ozone, Temp))
subset(airquality, Day == 1, select = -Temp)
subset(airquality, select = Ozone:Wind)



########################################### Working Directory 

getwd()  ## get the working directory 
setwd("C:/Users/Yiqing/Desktop")  ## change the working directory 
getwd()

ls() ##list jects in the working directory 


### Save: Save one object "survey" into a new file. 
save(survey,file='Study1.RData')


## Save.image():If you have many objects that you want to save,
#then using save can be tedious as you'd have to type the name of every object. 
#To save all the objects in your workspace as a .RData file, use the save.image() function.

save.image(file='Study2.RData')

load(file='Study1.RData')

rm(survey)

#### Furthermore, we can use write.table() to export data as a simple .txt text file. 

survey=data.frame("index" = c(1001, 1002, 1003, 1004, 1005),
                  "age" = c(24, 25, 42, 56, 22),'sex'=c('m','m','f','f','m'))


write.table(x=survey,file='survey.txt',sep='\t',row.names = FALSE)
##x---The object you want to write to a text file, usually a dataframe

##file=-----The document's file path relative to the working directory unless specified otherwise.
#for example file = "mydata.txt" saves the text file directly in the working directory, 
#while file = "data/mydata.txt" will save the data in an existing folder called data inside the working directory.
#You can also specify a full file path outside of your working directory 
#(file = "/Users/CaptainJack/Desktop/OctoberStudy/mydata.txt")

##sep-----A string indicating how the columns are separated. 
#For comma separated files, use sep = ",", for tab-delimited files, use sep = "\t"

##row.names--A logical value (TRUE or FALSE) indicating whether or not save the rownames in the text file.
#(row.names = FALSE will not include row names)


read.table("https://faculty.smu.edu/ngh/stat6304/dataset.txt") ## read.table to help us to read dataset into R 

read.table("https://faculty.smu.edu/ngh/stat6304/dataset.txt",header=T)

read.table("https://faculty.smu.edu/ngh/stat6304/dataset.txt",header=T,na.strings = '.')



## header: A logical value indicating whether the data has a header row 
#that is, whether the first row of the data represents the column names.

##na.string:a character vector of strings which are to be interpreted as NA values.



read.csv("Sales Data.csv",header = T)





######################################### Simple Functions 

# Area of a circle  S=pi*r^2

area_circle=function(r){
  pi*r^2
}

area_circle(10)

# If function 
Obesity=function(bmi){
  if(bmi>24){
    return('Attention: Obesity')
  }
  else{
    return('You are in the healthy range')
  }
}
Obesity(25)
Obesity(23)

## While function 

#Sum from 0 to N

SumofN=function(N){
  value=0
  i=0
  while(i<=N){
    value=value+i
    i=i+1
  }
  value
}
SumofN(50)


################################################Simple Plots 
x <- seq(-3,3, length = 1000) 
y <- exp(-x*x/2)/sqrt(2*pi)
plot(x,y) 
dev.off()

library(MASS)
plot(Cars93$MPG.city, type = "p") ## p--points l--lines b--both n--no plotting 
plot(Cars93$MPG.city, type = "l") 

plot(1:10,rep(5, 10), type = "l")
plot(1:10,rep(5, 10), type = "l", lwd = 2)  # lwd---Line width 

plot(1:10,rep(5, 10), type = "l", lty = 2) # lty---line type

plot(1:10,rep(5, 10), type = "l", col = 'red')

plot(1:10,rep(5, 10), type = "l", col = '#3F8C41')

# Get the color you want !
#http://www.rapidtables.com/convert/color/rgb-to-hex.htm

colors()
?plot.default
# Add titles 
plot(Cars93$MPG.city, Cars93$Weight, xlab = "", ylab = "")  
title(main = "Scatter plot of Cars Weight vs. City MPG", sub = "this is a subtitle", xlab = "City MPG", ylab = "Weight") 
# Add points
heavy <- Cars93$Weight > 3500
plot(Cars93$MPG.city, Cars93$Weight, type = "n")
points(Cars93$MPG.city[heavy], Cars93$Weight[heavy], pch = 16, col = 4) 
points(Cars93$MPG.city[!heavy], Cars93$Weight[!heavy], pch = 5, col = 6) 

# Add axis 
plot(Cars93$MPG.city, Cars93$Weight, axes=F)
axis(1)
axis(2)
axis(3)
axis(4) 
box()

# change axis 
plot(Cars93$MPG.city, Cars93$Weight, axes=F)
axis(1, at = c(20, 30, 40), labels = c("A", "B", "C"))
axis(2, lty = 2, lwd = 3, col = "red", font = 2)  ## font: 1=plain, 2=bold, 3=italic, 4=bold-italic

#Add text 
plot(1:10, 1:10, type = "n")
text(3,1,"This is font 1", font = 1, cex = 2) ## font: 1=plain, 2=bold, 3=italic, 4=bold-italic 5= Greek letters
text(3,2,"This is font 2", font = 2, cex = 2)
text(3,3,"This is font 3", font = 3, cex = 2)
text(3,4,"This is font 4", font = 4, cex = 2)
text(3,5,"This is font 5", font = 5, cex = 2)

# Add lines 
plot(Cars93$MPG.city, Cars93$Weight)
lines(c(20, 30), c(2000, 2000))

dev.off()

# Box Plot

boxplot(Cars93$Price) 

boxplot(Cars93$Weight~Cars93$DriveTrain, col = 2:4)


# Bar plot 
counts <- table(Cars93$Cylinders)
barplot(counts, col = 1:6, xlab = "Number of Cylinders", angle=seq(90,180,len=6), density=20) 

##Pie Chart 
pie(counts, col = rainbow(length(counts))) 

##Histogram 

hist(Cars93$Weight) 

##Starplot 
data(longley)
stars(longley[1:16,])

# Faceplot 
install.packages("aplpack")
library(aplpack)
faces(longley[1:16,],face.type=0) ## face.type=0---line drawing faces 
faces(longley[1:16,],face.type=1) ## face.type=1---the elements of the faces are painted 

#map
install.packages('maps')
library(maps)
map()
map("state") 

## 3d scatterplot 
install.packages("scatterplot3d")
library(scatterplot3d)
scatterplot3d(Cars93$MPG.city, Cars93$Weight, Cars93$Horsepower, pch = 15) 

################################# Simple Linear regreesion 

x=c(18,23,25,35,65,54,34,56,72,19,23,42,18,39,37)
y=c(202,186,187,180,156,169,174,172,153,199,193,174,198,183,178)
plot(x,y) # make a plot

lm1=lm(y~x)
summary(lm1)
coef(lm1)
resid(lm1)
fitted(lm1)
confint(lm1)

predict(lm1,data.frame(x= c(50,60)))
predict(lm1,data.frame(x=x), level=.9, interval="confidence")
predict(lm.result,data.frame(x=sort(x)), level=.9, interval="prediction")  


plot(x,y)
abline(lm1)
ci.lwr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="confidence")[,2]
ci.upr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="confidence")[,3]
pi.lwr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="prediction")[,2]
pi.upr=predict(lm1,data.frame(x=sort(x)), level=.9, interval="prediction")[,3]

lines(sort(x), ci.lwr,lty=2)
lines(sort(x), ci.upr,lty=2)
lines(sort(x), pi.lwr,lty=3)
lines(sort(x), pi.upr,lty=3)

plot(lm1)

################################ MLR 

colnames(Cars93)
Data_mlr=cbind(Cars93[,c('Price','Width','Weight','Turn.circle','Horsepower')],rnorm(dim(Cars93)[1]))
head(Data_mlr)

pairs(Data_mlr)
cor(Data_mlr)


############################two sample t-test

##A small jewelry shop has 2 kiosks in the mall; we will call them east and west side.
##The number of sales were recorded for a random selection of 50 hours that each store was open; 
#i.e. 3 means that during a one hour time period 3 sales were made. 
#The times span all hours the kiosk was open and were taken during intervals:  
#10am – 11am, 11am-noon, noon-1pm, etc. 


Sale=read.csv("Sales Data.csv",header = T)

#Question: Do the two kiosks perform similarly? (E_bar=W_bar?)
East=Sale[,1]
West=Sale[,2]

t.test(x=East,y=West,alternative='two.sided') ## Variance are not equal 
t.test(x=East,y=West,alternative='two.sided',var.equal = TRUE) ## variance are equal 


########################### One sample t-test 

mean(East)
## Question: Does expected sales per hour in East side exceed 3.5? 
t.test(East,alternative='greater',mu=3.5)


########################## Paired two sample t-test 
#Assume the shop owner modified management system for East side and the second column is the sale after modficiation
## We want to explore whether this modification improves sales. 

## Question: X_bar(1)=X_bar(2)

t.test(x=East,y=West,alternative='two.sided',paired=TRUE) ## Variance are not equal 

##########################Signed rank test --one sample --Wilcoxon signed-rank test (nonparametric)


library(MASS)
data(immer) ##The immer data frame has 30 rows and 4 columns.
##Five varieties of barley were grown in six locations in each of 1931 and 1932.
head(immer)

## Question: Without assuming the data to have normal distribution, 
##test at .05 significance level if the barley yields of 1931 and 1932 have identical data distributions.

wilcox.test(immer$Y1, immer$Y2, paired=TRUE) 


######################### Rank Sum test---2 samples

women_weight=c(38.9, 61.2, 73.3, 21.8, 63.4, 64.6, 48.4, 48.8, 48.5)
men_weight=c(67.8, 60, 63.4, 76, 89.4, 73.3, 67.3, 61.3, 62.4) 
# Create a data frame
my_data <- data.frame( 
  group = rep(c("Woman", "Man"), each = 9),
  weight = c(women_weight,  men_weight)
)
head(my_data)

wilcox.test(women_weight, men_weight)

res=wilcox.test(weight ~ group, data = my_data)
res$statistic
res$p.value

######################## one-way ANOVA 

data(PlantGrowth) 
#Results from an experiment to compare yields (as measured by dried weight of plants) 
#obtained under a control and two different treatment conditions.
head(PlantGrowth)

levels(PlantGrowth$group)

##We want to know if there is any significant difference 
#between the average weights of plants in the 3 experimental conditions.

ANOVA=aov(weight ~ group, data = PlantGrowth)
summary(ANOVA)


#####################