#This is a basic R script
#R is a software environment for statistical computing
#It operates via command line which makes it very versatile.
#Each line in a script is a command. It can be run by pressing Ctrl+R in standard RGui and Ctrl+Enter in RStudio
#If a line starts with a hashtag, it is not a command, it is a comment.
#So this is not a command
"This is"
#If something is in quotation marks, it is read as text and printed as such.
'This works as well'
#Double and single quotation marks both mark text.
#R is basically a very advanced calculator. So it can do things like this as well
5+5
239*999854
2^5 #You can combine commands and comments like this. Everything after the hashtag is ignored.
#In R commands spaces around operators play no role
1+62 #is the same as
1 + 62
#See?
#R is an object oriented language
#So the code consists mostly of objects and functions
#objects look like this: object
#functions look like this: function()
#You define objects like this:
box<-5
#You just put number 5 in the box
#You can see what is in the box by calling the object
box
#You can multiply the content of the box by 10. What do you think will happen?
box*10
#Or subtract something from the number in the box
box-2
#Neither of the two lines above changed the box, they only printed a result.
#You can do this and put the result back in the box. This is very useful.
box<-box-2
box #Look. Now there is number 3 in the box.
#You do not have to work with individual numbers only.
#You can use series of numbers, so called vectors.
c(1,2,8,5,3) #This is a vector. You put numbers in a vector using c() (combine) function.
#You can put a vector in an object.
box<-c(1,2,8,5,3)
#You can apply mathematical operations on the vector.
box*10
#And put it perhaps in another object.
boxtimes10<-box*10
#You can multiply two vectors. What will happen?
boxtimes10*c(1,10) #We get a result and a warning message: the shorter vector is repeated to match the longer one, and 5 is not a multiple of 2
c(1,2,3,4,5,6)*c(1,10) #Of course there is no warning if the longer object length is a multiple of shorter object length
#Now the most important R feature comes into play.
#You can imagine objects as material and functions as factories.
#Depending on the factory, you can get different products (fries, starch, booze) from the same material (potatoes).
#The same factory can also give you different products based on the raw material provided (potato starch vs corn starch).
#Here are some of the most popular functions:
mean(box)
length(box)
sum(box)
#You would be able to construct the mean function from other functions, obviously.
sum(box)/length(box)
#It might be useful in similar cases to create intermediate objects, especially in more complicated functions
itemsinbox<-length(box)
itemsum<-sum(box)
(result<-itemsum/itemsinbox) #If you put an expression like this in brackets, the assignment is executed and the object is printed at the same time.
itemsinbox #I can still call the previous objects. R remembers.
result
min(box) #minimum
max(box) #maximum
median(box) #median (50% quantile)
quantile(box) #different quantiles (min, max, median and quartiles by default)
quantile(box, probs=c(0.4,0.6)) #You can modify the quantiles
#The thing you modified is called a parameter, most functions have them. You can check what options you have if you ask R about a function
?quantile
#You can ask more generally.
??quantile
#As we can see in the help for the quantile function
?quantile
#the next parameter that can be changed is na.rm, defaulting to FALSE
#na.rm means "remove NA" (NA = Not Available)
#If set to TRUE, it will discard all Not Available values (these "holes") and only then performs the calculation.
#We can now add some "holes" in our box to see the difference
box<-c(box,NA,3,45,6,NA,12) #With the c() function, I can easily "glue together" previously created vector with some extra items.
quantile(box, probs=c(0.4,0.6)) #This line stops with an error (on purpose)
quantile(box, probs=c(0.4,0.6), na.rm=FALSE) #The same error, because FALSE is the default. It is because there are these missing values
quantile(box, probs=c(0.4,0.6), na.rm=TRUE) #Setting na.rm to TRUE solves the problem.
mean(box, na.rm=F) #If R expects TRUE/FALSE specifications somewhere, T/F specification is sufficient (but T and F are ordinary objects that can be overwritten, so TRUE/FALSE is safer)
sum(box, na.rm=0) #The same behavior can be expected from the sum() function and others.
sum(box, na.rm=1) #0/1 serve obviously as a replacement for TRUE/FALSE opposites.
sum(c(TRUE, FALSE, TRUE, TRUE, FALSE)) #In such cases TRUEs and FALSEs can behave like numbers.
#We might check the quantile() help again...
?quantile
#We can see that the probs parameter is specified by another function instead of a standard vector. probs = seq(0, 1, 0.25)
#This is a seq() function. This one is very useful. It gives you a sequence of numbers
seq(from=6,to=100,by=2) #If you specify these three parameters, it will give you this kind of sequence.
#Check the help
?seq
#These three parameters are the first. So if you just write three numbers in the function, R will assume that you specify the first three parameters in that order.
seq(6,100,2)
#By stating the names of the parameters, you can input them in any order.
seq(to=100,by=2,from=6)
#You can replace one of these parameters by parameter length.out which allows you to specify the total length of the sequence.
seq(to=100,by=2,length.out=6) #see?
seq(from=2,to=100,length.out=6) #This is also a useful combination of parameters.
#And we can input a sequence like this as a function parameter
quantile(box,na.rm=TRUE,probs=(seq(2,50,3))) #Error! (This line fails on purpose.) Probs argument can not assume values bigger than 1.
quantile(box,na.rm=TRUE,probs=(seq(2,50,3)/50)) #We can achieve this by dividing the whole sequence by the biggest number in the sequence.
#This is a good time to introduce another nice function: rep()
rep(1:10,times=3,each=2) #We define the vector from 1 to 10, each item will be repeated twice, this whole sequence will be repeated three times
1:10 #Notice that a simple sequence with the step of size 1 can be written easily by stating the borders with a colon
box<-c(1,2,8,5,3) #let's go back to the original box
var(box) #Other important functions estimate the variability of the items in the box, this will give you variance estimation
sd(box) #This will give you standard deviation estimation
#These are the careful variants (that divide the sum of squared deviations from the mean by length - 1)
sum((box-mean(box))^2)/(length(box)-1) #It fits
sqrt(sum((box-mean(box))^2)/(length(box)-1)) #this too
#This way you can calculate the non-careful version that divides by length only,
#i.e. it does not account for the box being just a sample from a larger (potentially infinite) population
sum((box-mean(box))^2)/length(box)
sqrt(sum((box-mean(box))^2)/length(box))
###How to programme your own function
#We will programme a function that will give you Standard error of mean (of population) on the basis of a small sample.
#It tells you how precisely you have estimated the mean on a vector.
SE <- function(material) { #SE is a function (factory) that takes one input: the material, a vector of numbers
  n_items <- length(material) #How many items there are in the material
  spread <- sd(material) #Standard deviation of the material - a measure of its variability
  spread / sqrt(n_items) #The product: standard deviation divided by the square root of the number of items. The last evaluated expression is what the function gives back (writing return() around it would do the same).
} #The closing bracket ends the definition of the function.
SE(box) #Our function works!
SE(c(1,1,1,1,1,1,1,1,2,1,2,1,1)) #If we calculate standard error from a homogeneous vector we get a small number
SE(c(1,5,6,88,99,10,-56,-8,2,5)) #If we use a heterogeneous vector we get a big number
SE(rep(5:10,each=1)) #Short vector: bigger error
SE(rep(5:10,each=100000)) #Longer vector: smaller error
#Things like that are at the core of statistical inference.
#In statistics we use equivalents of standard error (of various estimates, not just the mean) all the time.
#Another very important part of R syntax are square brackets.
#They allow you to address only a part of an object.
#Look:
box[2] #This will take only the second item in the box (R counts positions from 1)
box[2:4] #this will take items 2, 3, and 4 from the box
#If you have a two dimensional object, you need to use two coordinates
#Two-dimensional object is a matrix or a data frame
matrix(c(1,2,4,8,7,9,8,5,6,22,1,85), nrow=3) #Matrix is created like this. By default it is filled column by column.
matrix(c(1,2,4,8,7,9,8,5,6,22,1,85), nrow=3, byrow=TRUE) #You can fill the matrix by row as well
#We can save it as an object
#(TRUE is spelled out here: T is just an ordinary object that somebody could overwrite, TRUE can not be changed)
bigbox<-matrix(c(1,2,4,8,7,9,8,5,6,22,1,85), nrow=3, byrow=TRUE)
bigbox[2,4] #the addressing now works like this. First number in brackets indicates rows, the second indicates columns.
bigbox[,4] #If you leave one of the coordinates empty, R will return all the numbers that fit the description - whole 4th column
bigbox[1:2,] #Or first and second row - all columns
#Functions that we do not have to programme from scratch, but are very useful, are ordering functions.
#We will create a vector
vec<-c(2,18,6,95,-5)
rank(vec) #rank will replace the items in vector by integers in consecutive order. Smallest item will get number 1, second smallest number 2 etc.
order(vec) #order will tell you in the right order which item is smallest (the fifth item), which is second smallest (first item) etc
#This is useful when you order some vector.
#You can easily sort the original vector using the bracket addresses.
vec[order(vec)]
#You can achieve this even more straightforwardly with sort() function
sort(vec)
#But you can also order one vector along another.
box[order(vec)]
#functions order() and sort() can take parameter decreasing=TRUE which flips the outcome
#(TRUE is spelled out because T is an ordinary object that can be overwritten)
box[order(vec,decreasing=TRUE)]
#Logic operators in R
#Logic is a very important part of programming. Logical values are
TRUE
#and
FALSE
#And basic logic operators consist of and (&), or (|) and not (!)
#and: the result is TRUE only if both sides are TRUE
TRUE&TRUE
TRUE&FALSE
FALSE&TRUE
FALSE&FALSE
#or: the result is TRUE if at least one side is TRUE
TRUE|TRUE
TRUE|FALSE
FALSE|TRUE
FALSE|FALSE
#not: flips the value
!TRUE
!FALSE
#It is useful to know also relational operators.
#Two equal symbols mean "is equal to"
5==5 #five is equal to five
5==2+2 #five is not equal to four, so this gives FALSE
#Not equal to relational symbol is !=
10!=8 #ten is not equal to eight
9<8 #Less than
5>2 #Greater than
9<=9 #Less than or equal to
12>=6 #Greater than or equal to
#This all can be very useful in data handling.
#If for example you have a long vector.
vector<-seq(100,5866,111)
#and you want to select only even numbers bigger than 4000, you can do this
#Define TRUE/FALSE vector accordingly using ifelse() function
ifelse(vector%%2==0 & vector>4000, TRUE, FALSE) #the %% operator means modulus. Here if the number is even (divisible by 2) it returns 0 and hence the first part of the condition is TRUE.
#And then pick only the numbers with TRUE assigned value from the original vector
vector[ifelse(vector%%2==0 & vector>4000, TRUE, FALSE)]
#It is easy
#(The condition itself already is a TRUE/FALSE vector, so ifelse() is used here only to introduce it.)
#By the way ifelse() can output any values
ifelse(vector%%2==0 & vector>4000, "potato", "ice cream")
#Or you can use a series of ifelse() within each other (like IF() function in excel)
ifelse(vector%%2==0, ifelse(vector>4000, "potato", "ice cream"), "bicycle")
#Last thing we need to cover before we get to real data are loops.
#Loops are a very important part of any programming.
#A basic loop can be written in R with for().
for (i in 1:3) {
  takethis <- box[i]
  print(takethis * 2)
}
#This loop means
#For each i between 1 and 3
#take ith item from the box and save it to object takethis
#print the value of this item multiplied by two.
#You can construct new vectors using a loop.
#First you have to let R know that there should be an object called
newbox <- rep(NA_real_, length(box)) #for example. It is as long as the box and filled with empty values (NA) at first
for (i in seq_along(box)) { #here I want to do the operation for each item in the box, so I go through all of them. seq_along(box) gives 1, 2, ... up to the length of the box, and nothing at all if the box is empty (1:length(box) would give 1, 0 then)
  takethis <- box[i]
  newbox[i] <- takethis * 2 #Instead of printing the value, we will save it on ith place in newbox
}
newbox #of course we could achieve this output just by multiplication of the vector, but in more complicated cases, looping through the vector might be very useful.
#Especially in combination with conditions if and else
newbox <- rep(NA_real_, length(box)) #start with a vector of empty values (NA), one place for each item in the box
for (i in seq_along(box)) { #seq_along(box) gives the positions 1, 2, ... of the box and is safe even for an empty box
  takethis <- box[i]
  if (takethis %% 2 == 0) { #the item is even
    newbox[i] <- takethis * 2
  } else { #the item is odd
    newbox[i] <- NA
  }
}
#Here we for example want to loop through the box and use only even numbers.
#instead of odd numbers, we want to put NA in the vector
newbox #It worked