# My First Script.r
# Language: R
library(tidyverse)
library(dslabs)
# Steps from the video course: type each expression in the console and a
# result is returned.

# Decimals are written with a point, never a comma. The comma version is a
# syntax error, so it is kept only as a comment:
#   0.15*19,71
0.15 * 19.71

# install.packages() downloads and installs packages from CRAN. The package
# name must be quoted; the unquoted call fails because R looks for an object
# with that name:
#   install.packages(tidyverse)
# Install only when the package is missing so that re-running the script does
# not download it again.
if (!requireNamespace("dslabs", quietly = TRUE)) {
  install.packages("dslabs")
}
library(dslabs)
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# installed.packages() finds and returns details of the installed packages.
installed.packages()
# In RStudio, Tools -> Install Packages helps when you do not remember the
# name or spelling of a package.
# Define objects a, b and c in the console with the assignment arrow:
# name <- value.
a <- 1
b <- 1
c <- -1
# Object = variable: each object holds a value. "Object" refers to the
# "stuff stored in R".
# To see the value of one object, type its name (or print(a)) and press Enter.
a
b
c
# ls() lists all the objects saved in the workspace (environment).
ls()
# Using an object that has not been defined is an error. try() shows the
# error message without stopping the rest of the script.
try(x)
# Quadratic formula: because a, b and c are defined, both roots can be
# computed (about 0.618 and -1.618).
(-b + sqrt(b^2 - 4 * a * c)) / (2 * a)
(-b - sqrt(b^2 - 4 * a * c)) / (2 * a)
# Once the objects are defined, the data analysis process can be described as
# a series of functions applied to the data. R has many predefined functions
# and more become available through packages, e.g. install.packages, library,
# ls, sqrt.
# Parentheses are always needed to evaluate a function.
log(8)
log(a)
exp(1)
log(2.718282)
log(exp(1))
# Nested functions are evaluated from the inside out: exp(1) first, then the
# log of that result.
# Search the help system with help("name") or ?name.
help("log")
?log
# Arguments are the inputs a function needs to run; args() shows them
# together with their default values.
args(log)
# log() takes x (a value) and base; the default base is exp(1), i.e. the
# natural logarithm.
# Remember: the arrow <- defines objects, the equal sign = names an argument.
log(8, base = 2)
log(x = 8, base = 2)
log(8, 2)
2^3
help("+")
?"+"
# R includes preloaded data sets for practising and testing functions.
data("co2")
pi
Inf
# Variable names must start with a letter, cannot contain spaces and should
# not reuse names already stored in R; use underscores to join words.
# Redefine a, b and c: their values are now 3, 2 and -1.
a <- 3
b <- 2
c <- -1
# Example of descriptive variable names: the two solutions of the quadratic
# formula and their values.
solution_1 <- (-b + sqrt(b^2 - 4 * a * c)) / (2 * a)
solution_2 <- (-b - sqrt(b^2 - 4 * a * c)) / (2 * a)
# Creating and saving a script in the editor means nothing has to be retyped
# when we want to change something.
# Comments: a line that starts with # is not evaluated.
# rm() removes objects from the workspace. Load the data set first so there
# is an object to remove; rm() on a missing object only gives a warning.
data("women")
rm(women)
# 2.3 exercises
# Sum of the first n integers with the formula n * (n + 1) / 2.
n <- 1
sum_integers <- n * (n + 1) / 2
# With n <- 1 the formula gives 1.
n <- 100
sum_integers <- n * (n + 1) / 2
# With n <- 100 the formula gives 5050.
n <- 1000
sum_integers <- n * (n + 1) / 2
# With n <- 1000 the formula gives 500500.
n <- 1000
x <- seq(1, n)
sum(x)
# Answer b: seq creates a list of numbers and sum adds them up.
# seq() by default generates regular sequences in steps of 1.
# Evaluating a function means replacing the argument with a value.
log(x = sqrt(100), base = 10)
sqrt(100)
log(10, 10)
log(10^x)
log10(x^10)
log(exp(x))
exp(log(x, base = 2))
log(1)       # 0
exp(0)       # 1
log(exp(0))  # 0
# log and exp are inverse functions: the logarithm of 1 is 0 and the
# exponential of 0 is 1.
# Answer c: log(exp(x)). With the default base e, log and exp are inverse
# functions, so the result is the value of x.
# class() tells the type of an object. The values after # are the results
# recorded in the console.
class(a)   # "numeric"
class(ls)  # "function"
# Storing just one number per variable is not very useful; data sets are
# stored in data frames.
# Load the dslabs library and then the data set we want.
library(dslabs)
data("murders")
class(murders)  # "data.frame"
# In murders, rows are observations and columns are variables; each column of
# a data frame is a vector.
# str() shows the structure of an object.
str(murders)
# murders is a data frame with 51 observations and 5 variables.
# head() shows the first six rows of an object, tail() the last six. This is
# the usual layout in data science: observations in rows, variables in
# columns.
head(murders)
# For the analysis we need to access the individual variables; use the
# accessor $.
murders$population
# names() shows only how the variables are named.
names(murders)
# The variable names were also visible as column headers in head() and after
# the $ signs in str().
# The accessor $ works for any column of the data frame.
murders$population
# population is not a single number: it has 51 numbers, one per observation.
# The entries of murders$population keep the order of the rows of the table,
# which is useful to manipulate one variable based on another.
# Vector: an object holding a single number or a list of entries.
# length() tells how many entries a vector has.
pop <- murders$population
length(pop)  # 51
class(pop)   # "numeric"
# Vectors can be numeric, character or logical:
#   numeric vectors: entries are numbers
#   character vectors: entries are character strings
#   logical vectors: entries are TRUE or FALSE
# Quotes distinguish character strings from variable names.
a           # 3
class(a)    # "numeric"
"a"         # "a"
class("a")  # "character"
class(murders$state)  # "character"
# == is a relational operator; its result is a logical value.
z <- 3 == 2
z  # FALSE
z <- 3 == 3
z  # TRUE
class(z)  # "logical"
# ?Comparison documents the other relational operators.
?Comparison
class(murders$region)  # "factor"
# Factors store categorical data.
# levels() shows the category labels of a variable; nlevels() gives only the
# number of levels.
levels(murders$region)
murders$state
murders$region
# state is a character vector, region is a factor (categorical data).
# Factors are needed to fit statistical models to categorical data.
# Data frames are a type of list, so variables can be extracted with $ or
# with double square brackets and quotes, e.g. murders[["population"]].
# Exercises
# str(murders) shows the state name, the state abbreviation, the region, the
# population and the total number of murders.
str(murders)
# names() shows only the variable names.
names(murders)
# Extract the abbreviations with $, assign them to a and determine the class
# of a: "character".
murders$abb
a <- murders$abb
class(a)
# Access the abbreviations with [["..."]], assign them to b and use
# identical() to check whether a and b are equal: TRUE.
murders[["abb"]]
b <- murders[["abb"]]
identical(a, b)
a == b
levels(murders$region)
length(murders$region)
# In one line of code, combine levels() and length() to get the number of
# regions defined by this data set.
length(levels(murders$region))  # 4
# table() takes a vector and returns the frequency of each element; here it
# gives the number of states per region.
table(murders$region)
# Section 1 assessment
# Define a = 2, b = -1, c = -4 and solve the quadratic equation.
a <- 2
b <- -1
c <- -4
(-b + sqrt(b^2 - 4 * a * c)) / (2 * a)
(-b - sqrt(b^2 - 4 * a * c)) / (2 * a)
solution_1 <- (-b + sqrt(b^2 - 4 * a * c)) / (2 * a)
solution_2 <- (-b - sqrt(b^2 - 4 * a * c)) / (2 * a)
# Do not forget to redefine solution_1 and solution_2: they were saved earlier
# with other values of a, b and c, which is why the first answer was wrong.
# log4() is not a function; try() shows the error without stopping the
# script. Use log() with base = 4 instead of the natural log.
try(log4(1024))
log(1024, base = 4)
# Load the movielens data set from dslabs and inspect its structure, its
# variable names and the genres column.
library("dslabs")
data(movielens)
str(movielens)
names(movielens)
class(movielens[["genres"]])
levels(movielens[["genres"]])
# nlevels() returns only the number of levels, not the level names.
nlevels(movielens[["genres"]])
# Vector: the basic unit for storing data in R (the entries of an object).
# Complex data sets can be broken into vectors; each column of a data frame
# is a vector.
# c() ("concatenate") creates vectors.
codes <- c(380, 124, 818)
codes
class(codes)  # "numeric"
country <- c("italy", "canada", "egypt")
country
class(country)  # "character"
# Character entries are written in quotes. Without quotes R would look for
# variables with those names instead of defining the vector.
# Double quotes " and single quotes ' are equivalent.
# name = value pairs connect each entry with a name.
codes <- c(italy = 380, canada = 124, egypt = 818)
codes
class(codes)
# The three numbers are now associated with three countries, but codes is
# still a numeric vector.
# names() can assign the entry names after the vector is created:
# names(codes) <- country gives the same result as
# codes <- c(italy = 380, canada = 124, egypt = 818).
names(codes) <- country
identical(names(codes), c("italy", "canada", "egypt"))
# seq() creates sequences: the 1st argument is the start, the 2nd the end and
# the 3rd the step (1 by default).
seq(1, 10)
# seq(1, 10) gives the consecutive integers from 1 to 10.
seq(1, 10, 2)
1:10
# Use : instead of seq() only for consecutive sequences (step 1).
class(seq(1, 10))
class(seq(1, 10, 2))
# Access variables with $ or []; access specific parts of a vector
# (subsetting) with [].
codes[c(1, 3)]
# A multi-entry index or a sequence gets more than one entry.
codes[1:2]
codes[seq(1, 2)]
codes["canada"]
codes[c("italy", "egypt")]
# Names used as an index go in quotes inside [].
# Coercion: R tries to be flexible with data types. Before throwing an
# error, built-in functions try to guess what was meant by an entry that does
# not match.
x <- c(1, "canada", 3)
x
class(x)
# Vectors must be all of the same type. Instead of an error, R converted 1
# and 3 into character strings.
# A specific coercion can also be forced.
x <- 1:5
y <- as.character(x)
y
as.numeric(y)
y
# as.character() turns numbers into characters; as.numeric() turns characters
# into numbers. Many public data sets store numbers as characters, which is
# why these functions are useful.
# NA ("not available") appears when data is missing or when a value cannot be
# coerced.
x <- c("1", "b", "3")
as.numeric(x)
# R does not guess here: "1" and "3" become numbers, "b" becomes NA (with a
# warning). NA is common in real data sets where values are missing.
# x itself is still character until the coerced result is assigned back.
class(x)
x <- as.numeric(x)
# 2.8 exercises
# Create the temp vector with c().
temp <- c(35, 88, 42, 84, 81, 30)
# Create the city vector with c().
city <- c("Beijing", "Lagos", "Paris", "Rio de Janeiro", "San Juan", "Toronto")
# Use names() to associate each temperature with its city.
names(temp) <- city
temp
# Use [] and : to access the temperatures of the first three cities.
temp[1:3]
# Use c() with quoted names inside [] to extract two or more named entries.
temp[c("Paris", "San Juan")]
# A single quoted name inside [] extracts one entry; no c() needed.
temp["Paris"]
12:73
# Positive odd numbers smaller than 100.
seq(1, 100, 2)
# Sequence from 6 to 55 in steps of 4/7 (not the default step); length()
# tells how many numbers are in it.
seq(6, 55, 4 / 7)
length(seq(6, 55, 4 / 7))
a <- seq(1, 10, 0.5)
class(a)
a <- seq(1, 10)
class(a)
# class(a) can be "numeric" (allows decimals) or "integer" (whole numbers
# only).
a <- 1
class(a)
# A plain number is numeric by default; add an L after the number when the
# integer class is needed.
a <- 1L
class(a)
x <- c("1", "3", "5")
class(x)
as.integer(x)
# Integers take less memory than numerics, which helps with big data.
# sort() sorts a vector in increasing order.
library(dslabs)
data(murders)
sort(murders$total)
# Smallest number of murders: 2; largest: 1257.
# order() returns the index needed to obtain the sorted vector.
x <- c(31, 4, 15, 92, 65)
x
sort(x)
order(x)
# Indexing x with order(x) puts the vector in order.
index <- order(x)
x[index]
x[1]
order(x)
sort(x)
# murders$state is a vector: access it with $ and its entries with []. The
# entries follow the same order as the rows of the table.
murders$state[1:10]
murders$abb[1:10]
# 1st: define the index that orders the totals.
index <- order(murders$total)
# 2nd: use the index on the state abbreviations.
murders$abb[index]
# The states now appear in ascending order of total murders.
# max() and min() return the largest or smallest value.
max(murders$total)
# which.max() and which.min() return the index (row number) where that value
# is found.
which.max(murders$total)
i_max <- which.max(murders$total)
i_max
# Find the state name either with the index typed by hand, e.g. [5], or with
# the stored index, e.g. [i_max].
murders$state[5]
murders$state[i_max]
min(murders$total)
which.min(murders$total)
i_min <- which.min(murders$total)
i_min
murders$state[46]
murders$state[i_min]
# rank() gives the rank of each entry of the original vector, from smallest
# to largest.
x
rank(x)
# sort() returns the sorted values of the original vector.
sort(x)
# order() returns the index needed to get the sorted data.
order(x)
# 2.10 Exercises
library(dslabs)
data("murders")
murders$population
# Define pop as the sorted population vector, then extract the smallest
# population size with square brackets.
pop <- sort(murders$population)
pop[1]  # 563626
# Use order() on the unsorted vector: its first entry is the index of the
# smallest population size.
index <- order(murders$population)
index[1]
murders$population[index[1]]
# which.min() finds that index in one line. It must be applied to the
# unsorted murders$population; on the sorted pop it would always return 1.
which.min(murders$population)
murders$state[index]
states <- murders$state
states[which.min(murders$population)]
# Wyoming is the state with the smallest population.
class(city)
class(temp)
# data.frame() accepts the vectors as they are, or as name = vector pairs to
# choose the column names.
city_temps <- data.frame(city, temp)
class(city_temps)
city_temps <- data.frame(name = city, temperature = temp)
# rank() must be applied to the populations themselves: the rank of
# order(...) is just the order index again, not the population rank.
ranks <- rank(murders$population)
my_df <- data.frame(states, ranks)
my_df
# Order the new data frame from the least to the most populous state by
# applying the same order index to both columns.
index
my_df <- data.frame(states = states[index], ranks = ranks[index])
my_df
library(dslabs)
data("na_example")
na_example
str(na_example)
class(na_example)
# na_example holds integer numbers but also many NA values.
# mean() calculates the average; with NAs present the result is NA.
mean(na_example)
# is.na() tells which entries are missing (NA).
is.na(na_example)
?Comparison
# 1st: define ind as the logical vector of missing entries.
# 2nd: index na_example with [ind] to see all the missing values.
# 3rd: use length() to count how many NAs the vector has.
ind <- is.na(na_example)
na_example[ind]
length(na_example[ind])
# The logical operator ! is negation ("not"). Follow the same steps with !ind
# to count the numeric values.
na_example[!ind]
length(na_example[!ind])
# There are 855 numeric values and 145 NA (missing) values.
# The mean was NA at first; now it is a number because the NAs were removed
# with !ind before the calculation.
mean(na_example[!ind])
# sum() on a logical vector counts its TRUE entries, so sum(ind) gives the
# same count as length(na_example[ind]).
# Access the variable with $ and the entry with [] to confirm that California
# has the largest population and Wyoming the smallest.
murders$state[which.max(murders$population)]
murders$state[which.min(murders$population)]
max(murders$population)
min(murders$population)
# To compare states fairly, compare murders per capita (total murders divided
# by the state's population) instead of totals.
# rm() or remove() erases objects defined in the environment. The object
# created earlier in these notes is named states; no object named state is
# defined in them, so rm(state) would only produce a warning.
rm(states)
# Height units: 1 inch is 2.54 cm.
heights <- c(69, 62, 66, 70, 70, 73, 67, 73, 67, 70)
# Each entry of heights converted to cm.
heights * 2.54
mean(heights)
# Taking roughly 69 as the average, height minus 69 shows how far each entry
# is above or below it.
heights - 69
# Arithmetic operations work element-wise on vectors of the same length.
# Murder rate: total murders of each state / population of each state *
# 100000, i.e. murders per 100,000 people.
murder_rate <- murders$total / murders$population * 100000
murder_rate
# Order the states by murder_rate: ascending by default, descending with
# decreasing = TRUE.
murders$state[order(murder_rate)]
murders$state[order(murder_rate, decreasing = TRUE)]
# District of Columbia has the highest rate and Vermont the lowest (per
# 100,000 people).
# 2.12 Exercises
# 0 °C is 32 °F with the formula °C = (5/9) * (°F - 32).
(5 / 9) * (32 - 32)
# Convert the temperatures once, under a proper name, so the data frame
# column is not given an automatically mangled name.
temp_Celsius <- (5 / 9) * (temp - 32)
temp_Celsius
my_df <- data.frame(city, temp, temp_Celsius)
my_df
str(my_df)
# The sum of 1 / x^2 for x from 1 to 100 approximates pi^2 / 6.
(pi^2) / 6
x <- seq(1, 100)
sum(1 / x^2)
# Average murder rate per 100,000 people: about 2.779.
murder_rate <- murders$total / murders$population * 100000
mean(murder_rate)
# Matrices are similar to data frames, but all entries must be of the same
# type.
# matrix(): give the entries, the number of rows (4) and of columns (3).
mat <- matrix(1:12, 4, 3)
mat
# Use [row, column] to access entries: a specific row and column, a sequence
# of rows and columns, or leave one side blank for the whole row or column.
mat[2, 3]
mat[2, ]
mat[, 3]
mat[, 2:3]
mat[1:2, 2:3]
# as.data.frame() converts a matrix into a data frame.
as.data.frame(mat)
# [] can also be used to access rows and columns of a data frame.
# A single index selects columns, so murders[51] fails with "undefined
# columns selected"; try() shows the error without stopping the script.
try(murders[51])
data("murders")
# With a comma, the first index selects rows and the second selects columns.
murders[51, ]
murders[51, 4]
murders[51, 1:4]
# Runners, the distance each one covered (miles) and the time taken (minutes).
name <- c("Mandi", "Amy", "Nicole", "Olivia")
distance <- c(0.8, 3.1, 2.8, 4.0)
time <- c(10, 30, 40, 50)
# Convert the time from minutes to hours in one line of code.
time / 60
time <- time / 60
# Speed (distance / time) in miles per hour.
distance / time
# 6.2 miles/hour is faster than 4.2 miles/hour: in one hour Amy covers more
# distance.
# Remember for the assessments: with three digits the value is 0.833.
# () call a function; [] access the entries of a vector or rows and columns.
# Indexing: subset a vector based on another vector.
murder_rate
# Which states have a murder rate lower than or equal to 0.71 per 100,000
# people? < is "less than", <= is "less than or equal to"; this needs <=.
index <- murder_rate <= 0.71
# Relational operators give TRUE (rate <= 0.71) or FALSE for each entry.
index
# Vectors can be indexed with logicals: [index] keeps the TRUE entries.
murders$state[index]
# sum() coerces logicals to numbers (TRUE is 1, FALSE is 0), so sum(index)
# counts the TRUE entries; length() of the subset gives the same count.
sum(index)                    # 5
length(murders$state[index])  # 5
# & ("and") combines two conditions, e.g. murder rate <= 1 and West region.
TRUE & TRUE
TRUE & FALSE
FALSE & FALSE
murders$region
west <- murders$region == "West"
safe <- murder_rate <= 1
# index is TRUE only where both safe and west are TRUE.
index <- safe & west
# Which states are both safe and in the West?
murders$state[index]
# Operators: < less than, <= less than or equal, > greater than, >= greater
# than or equal, == equal, != not equal, ! NOT, | OR, & AND.
# which(), match() and %in% are indexing helpers that work with logicals.
# which() returns the positions of the TRUE entries of a logical vector.
x <- c(FALSE, TRUE, FALSE, TRUE, TRUE, FALSE)
which(x)
# which() gives the index where the state is Massachusetts; use it to access
# the murder rate.
index <- which(murders$state == "Massachusetts")
index
murder_rate[index]
which(murders$state == "Massachusetts")
murder_rate[which(murders$state == "Massachusetts")]
murder_rate[which(murders$state == "California")]
# match() finds several entries in a vector and returns their indices: here
# the rows of New York, Florida and Texas in murders$state.
index <- match(c("New York", "Florida", "Texas"), murders$state)
index
murders$state[index]
murder_rate[index]
# %in% tells whether each entry of a first vector is in a second vector.
x <- c("a", "b", "c", "d", "e")
y <- c("a", "d", "f")
x %in% y
y %in% x
# Entries of y that are in x: TRUE TRUE FALSE.
# Concatenate three names and use %in% to find out whether they are US
# states: Boston and Dakota are not state names, Washington is.
c("Boston", "Dakota", "Washington") %in% murders$state  # FALSE FALSE TRUE
murders$state %in% c("Boston", "Dakota", "Washington")  # 50 FALSE, 1 TRUE
# These functions are useful for subsetting data sets into strata.
match(c("New York", "Florida", "Texas"), murders$state)
which(murders$state %in% c("New York", "Florida", "Texas"))
# match() and which(... %in% ...) find the same rows; match() keeps the order
# of its first argument, which() returns the indices in increasing order.
# Exercises 2.14
library(dslabs)
data("murders")
murder_rate <- murders$total / murders$population * 100000
murder_rate
# Define low as murder_rate less than 1, then access the matching state names.
low <- murder_rate < 1
low
which(low)
murders$state[low]
# Report the states in the Northeast with murder rates lower than 1.
# 1st: use the logical & to combine "rate lower than 1" with the region.
(murder_rate < 1) & (murders$region == "Northeast")
low & (murders$region == "Northeast")
# 2nd: index the state names with the logical vector, or use which() to get
# the row numbers and index with those.
murders$state[low & (murders$region == "Northeast")]
index <- which(low & (murders$region == "Northeast"))
index
murders$state[index]
# Average murder rate and the names of the states below the average. The
# computed mean is used instead of a rounded, hand-typed value.
mean(murder_rate)
murders$state[murder_rate < mean(murder_rate)]
# Match the three abbreviations to their rows, then extract the state names.
index <- match(c("AK", "MI", "IA"), murders$abb)
murders$state[index]
# MA, ME, MI and MO are real abbreviations; MU is not.
c("MA", "ME", "MI", "MO", "MU") %in% murders$abb
# Use the logical operator ! (NOT) to flag the one that is not an
# abbreviation.
!c("MA", "ME", "MI", "MO", "MU") %in% murders$abb
# plot() makes scatterplots.
x <- murders$population / 10^6
y <- murders$total
plot(x, y)
plot(murders$population, murders$total)
# There is a strong relationship between population (in millions) and total
# gun murders.
# with() avoids writing murders$ twice (for population and total).
with(murders, plot(population, total))
# hist() draws a histogram: how often values fall into each range.
x <- with(murders, total / population * 100000)
hist(x)
hist(murders$total / murders$population * 100000)
hist(murder_rate)
# Which state (frequency 1) has a murder rate above 15?
which.max(murder_rate)
# The same state three ways: by the row number typed by hand and by the
# index computed with which.max().
murders$state[9]
murders$state[which.max(x)]
murders$state[which.max(murder_rate)]
# boxplot() compares the murder rate across regions. The tilde ~ separates
# the left-hand side (values) from the right-hand side (groups). With
# data = murders the column can be named directly; murder_rate is taken from
# the workspace.
boxplot(murder_rate ~ region, data = murders)
boxplot(murder_rate ~ murders$region)
# image() displays the values of a matrix.
x <- matrix(1:120, 12, 10)
image(x)
population_in_millions <- murders$population / 10^6
total_gun_murders <- murders$total
plot(population_in_millions, total_gun_murders)
# Put both variables on the log10 scale for a better view.
population_in_millions <- log10(murders$population / 10^6)
total_gun_murders <- log10(murders$total)
plot(population_in_millions, total_gun_murders)
# Three graphs: scatterplot of total murders against population in millions,
# histogram of state populations, boxplot of population by region.
hist(murders$population)
boxplot(murders$population ~ murders$region)
boxplot(population ~ region, data = murders)
# A dataset is a collection of values, usually numbers (if quantitative) or
# strings (if qualitative). Values are organised in two ways: every value
# belongs to a variable and to an observation. A variable contains all values
# that measure the same underlying attribute (like height, temperature,
# duration) across units. An observation contains all values measured on the
# same unit (like a person, or a day, or a race) across attributes.
# Tidy data is a standard way of mapping the meaning of a dataset to its
# structure. A dataset is messy or tidy depending on how rows, columns and
# tables are matched up with observations, variables and types. In tidy data:
# - Each variable forms a column.
# - Each observation forms a row.
# - Each type of observational unit forms a table.
# Most messy datasets can be tidied with a small set of tools: gathering,
# separating and spreading, i.e. gather(), separate() and spread(). Other
# useful helpers: the na.rm argument, arrange(), mutate(), select(), unique().
library(tidyverse)
# The tidyverse package includes dplyr to manipulate data frames, purrr to
# work with functions and ggplot2 to make graphs.
# Tidy format: data frames where each row is an observation and the columns
# are the variables available for each observation.
murders
str(murders)
# Data that is not in tidy format has to be reshaped first (data wrangling).
# Using tidy formats for both the inputs and the outputs of a data frame
# makes data analysis easier.
library(dplyr)
# The dplyr package from the tidyverse provides common functions to
# manipulate data frames.
# mutate() changes an existing column or adds a new one, returning the
# modified data frame.
library(dslabs)
data("murders")
# data_frame <- mutate(1st argument: the data frame to manipulate, 2nd
# argument: name = value of the new variable to create).
# mutate() is similar to with(): murders is given as the 1st argument, so the
# columns are written without murders$ in the 2nd argument.
murders <- mutate(murders, rate = total / population * 100000)
head(murders)
str(murders)
# To get back the data frame without the rate variable, load it again with
# library(dslabs) and data("murders").
data("murders")
# A variable name typed inside a dplyr function refers to a column of the
# data frame, NOT to an object in the workspace.
library(dplyr)
murders <- mutate(murders, rate = total / population * 100000)
# Do not forget to load dplyr again if R was closed in between.
# filter() keeps a subset of rows:
# filter(1st argument: data frame, 2nd argument: conditional statement).
filter(murders, rate <= 0.71)
# filter() returns the rows where the condition is TRUE, here rates lower
# than or equal to 0.71.
# select() keeps a subset of columns, just the ones we want to work with:
# select(1st argument: data frame, 2nd and further arguments: variables).
select(murders, state, region, rate)
filter(select(murders, state, region, rate), rate <= 0.71)
new_table <- select(murders, state, region, rate)
filter(new_table, rate <= 0.71)
# The pipe operator %>% sends the result of one function to the next one,
# like a pipeline. It avoids intermediate objects such as new_table and
# gives the same result.
murders %>% select(state, region, rate) %>% filter(rate <= 0.71)
# The pipe sends the result from its left side to its right side.
16 %>% sqrt()
16 %>% sqrt() %>% log2
log2(sqrt(16))
# The pipe reads from left to right, the opposite of how nested function
# calls are evaluated (inside out).
16 %>% sqrt() %>% log(base = 2)
# dplyr functions take the data as their 1st argument, and whatever comes
# before the pipe becomes the 1st argument of the next function: 16 is the
# 1st argument of sqrt(), and the result of sqrt() is the 1st argument of
# log2().
# The pipe works well with tidyverse packages like dplyr, where the 1st
# argument is the input data.
# 4.4 Exercises
library(dplyr)
library(dslabs)
data("murders")
murders <- mutate(murders, rate = total / population * 100000)
x <- c(1, 3, 5, 7, 9)
rank(x)
rank(-x)
# rank() ranks from smallest to largest by default; put a minus sign before
# the values to rank from largest to smallest.
rank(murders$rate)
rank(-murders$rate)
# Add a rank column with the inverse rank of the murder rate. Inside mutate()
# the column rate is used, so no separate workspace vector is needed.
murders <- mutate(murders, rank = rank(-rate))
head(murders)
# Use select() to show state names and abbreviations WITHOUT redefining the
# murders object.
select(murders, state, abb)
# Subsetting rows: filter(). Subsetting columns: select().
filter(murders, state == "New York")
which.max(murders$rate)
murders$state[which.max(murders$rate)]
# Use filter() to show the top 5 states with the highest murder rates.
# filter() returns the whole rows that satisfy the condition.
filter(murders, rank <= 5)
# Remove rows with the != ("not equal") operator: a data frame without the
# state Florida.
no_florida <- filter(murders, state != "Florida")
# View() opens the data viewer, which is only available interactively.
if (interactive()) {
  View(no_florida)
}
# nrow() and ncol() give the number of rows and columns.
nrow(no_florida)
no_south <- filter(murders, region != "South")
nrow(no_south)
# Use %in% inside filter() to see the data from New York and Texas.
filter(murders, state %in% c("New York", "Texas"))
# Use %in% inside filter() to keep the Northeast and West regions in a new
# data frame called murders_nw.
murders_nw <- filter(murders, region %in% c("Northeast", "West"))
murders_nw
nrow(murders_nw)
# Example of filtering by two conditions.
filter(murders, population < 5000000 & region == "Northeast")
# Console-only debugging helpers to see where an error occurred. They fail
# when no error has been recorded, so they are kept as comments:
#   rlang::last_error()
#   rlang::last_trace()
ncol(murders)
nrow(murders)
# Create a new data frame with the Northeast and West regions and murder
# rates lower than or equal to 1. To filter by two conditions, either use
# %in% with c(), or == and | between parentheses, combined with &.
my_states <- filter(murders, region %in% c("Northeast", "West") & rate <= 1)
my_states
my_states <- filter(murders, (region == "Northeast" | region == "West") & rate <= 1)
my_states
select(my_states, state, rate, rank)
# 4.6 Exercises
library(dplyr)
library(dslabs)
data("murders")
# Redefine murders, adding the rate and rank variables again. rank uses the
# rate column created in the same mutate() call.
murders <- mutate(murders, rate = total / population * 100000, rank = rank(-rate))
my_states <- filter(murders, region %in% c("Northeast", "West") & rate <= 1)
select(my_states, state, rate, rank)
# Use the pipe %>% and select() to get the same result without defining the
# object my_states.
mutate(murders, rate = total / population * 100000, rank = rank(-rate)) %>%
  select(state, rate, rank)
filter(murders, region %in% c("Northeast", "West") & rate <= 1) %>%
  select(state, rate, rank)
# The pipe is like sending everything through a pipeline: the data frame
# murders does not have to be named again in the second stage.
# Reload murders and, in one pipeline, add rate and rank with mutate(), filter
# by Northeast or West and rate <= 1, and select state, rate and rank.
library(dplyr)
library(dslabs)
data("murders")
my_states <- murders %>%
  mutate(rate = total / population * 100000, rank = rank(-rate)) %>%
  filter(region %in% c("Northeast", "West") & rate <= 1) %>%
  select(state, rate, rank)
my_states
# Small example data frame: one row per student, two exam scores each.
grades <- data.frame(
  names = c("John", "Juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90)
)
grades
# Before R 4.0.0 data.frame() turned character columns into factors by default;
# from R 4.0.0 on the default is stringsAsFactors = FALSE.
class(grades$names)
# Pass stringsAsFactors = FALSE explicitly to get characters on any R version.
grades <- data.frame(
  names = c("John", "Juan", "Jean", "Yao"),
  exam_1 = c(95, 80, 90, 85),
  exam_2 = c(90, 85, 85, 90),
  stringsAsFactors = FALSE
)
class(grades$names)  # "character"
class(grades$exam_1) # "numeric"
class(grades$exam_2) # "numeric"
# Note: population (in millions) and total gun murders are strongly related.
# Name of the state at the position where murder_rate is largest.
murders[["state"]][which.max(murder_rate)]
# Section 3 Assessment
library(dslabs)
data("heights")
# options(digits = number) sets how many significant digits are printed for
# numeric values; the default is 7 digits.
options(digits = 3)
class(heights)
head(heights)
heights$height
mean(heights$height)
ind <- heights$height > 68.3
ind
sum(ind)
ind & (heights$sex == "Female")
sum(ind & (heights$sex == "Female"))
mean(heights$sex == "Female")
# mean() on a logical vector returns the proportion of TRUE observations.
min(heights$height)
# Keep the position of the minimum instead of typing the row number by hand.
min_ind <- which.min(heights$height)
min_ind
heights$height[min_ind]
match(50, heights$height)
# Find the sex of the individual with the minimum height.
heights$sex[min_ind]
max_ind <- which.max(heights$height)
max_ind
heights$height[max_ind]
max(heights$height)
# Minimum height is 50 and maximum height is 82.7: create a vector with the
# integer values between the minimum and the maximum height.
x <- 50:82
x
# Careful: heights$height[50:82] is something different, it returns the heights
# stored in rows 50 to 82 of the dataset.
x <- heights$height[50:82]
x
# How many of the x values are not in the heights dataset?
x <- 50:82
!x %in% heights$height
sum(!x %in% heights$height)
# Create a new column of heights in centimeters, ht_cm, and save the result in
# a new data frame heights2 (1 inch = 2.54 cm).
library(dplyr)
heights$height * 2.54
heights2 <- mutate(heights, ht_cm = height * 2.54)
head(heights2)
heights2$ht_cm[18]
mean(heights2$ht_cm)
# Create a data frame named females by filtering heights2 for female
# individuals only.
females <- filter(heights2, sex == "Female")
females
nrow(females)
mean(females$ht_cm)
library(dslabs)
data("olive")
head(olive)
# Scatter plot of palmitoleic against palmitic acid.
plot(olive$palmitic, olive$palmitoleic)
# Answer: there is a positive linear relationship between the palmitic and
# palmitoleic fatty acids of the olives.
hist(olive$eicosenoic)
# Answer: eicosenoic acid is commonly below 0.05%.
boxplot(palmitic ~ region, data = olive)
# Answer: Southern Italy has the highest median values and also the most
# variable values.
# R is a data analysis environment and a programming language.
# Three key basic programming concepts: CONDITIONAL EXPRESSIONS, FOR-LOOPS, FUNCTIONS.
# Other programming functions to learn: split, cut, do.call, reduce, data.table package.
# if-else statement (control flow):
#   if(conditional){print(expression)} else{print(expression)}
# Print the reciprocal of a, unless a is 0.
a <- 0
if (a != 0) {
  print(1 / a)
} else {
  print("No reciprocal for 0")
}
# Console output: [1] "No reciprocal for 0"
# Read as: a <- 0; if a is different from 0, print 1/a; otherwise print
# "No reciprocal for 0".
a <- 2
if (a != 0) {
  print(1 / a)
} else {
  print("No reciprocal for 0")
}
# Console output: [1] 0.5
# Read as: a <- 2; if a is different from 0, print 1/a; otherwise print
# "No reciprocal for 0".
# General form: if(boolean condition){expressions} else{alternative expressions}
# When the condition is TRUE the first expression is performed; when it is
# FALSE the alternative expression after else is performed.
library(dslabs)
data("murders")
murder_rate <- murders$total / murders$population * 100000
# Which states, if any, have murder rates lower than 0.5?
ind <- which.min(murder_rate)
murder_rate[ind]
ind
if (murder_rate[ind] < 0.5) {
  print(murders$state[ind])
} else {
  print("No state has murder rate low")
}
# With the threshold lowered to 0.25 we get the alternative (else) expression.
if (murder_rate[ind] < 0.25) {
  print(murders$state[ind])
} else {
  print("No state has murder rate low")
}
# which() can return several positions, but if() needs a single TRUE/FALSE
# (a longer condition is an error in R >= 4.2), so collapse it with any().
ind <- which(murder_rate < 0.5)
murder_rate[ind]
ind
if (any(murder_rate[ind] < 0.5)) {
  print(murders$state[ind])
} else {
  print("No state has murder rate low")
}
if (any(murder_rate[ind] < 0.25)) {
  print(murders$state[ind])
} else {
  print("No state has murder rate low")
}
# With murder_rate < 0.5 the states New Hampshire and Vermont are printed; with
# murder_rate < 0.25 the else branch prints: No state has murder rate low
# ifelse(logical, answer 1, answer 2) takes three arguments.
# When the logical is TRUE, answer 1 (the "if" answer) is returned; when it is
# FALSE, answer 2 (the "else" answer) is returned.
a <- 0
ifelse(a > 0, 1 / a, NA)
a <- 2
ifelse(a > 2, 1 / a, NA)
ifelse(a > 1, 1 / a, NA)
# ifelse() works on vectors: it examines each entry of the logical vector and
# returns an answer for each entry.
# ifelse(logical vector, answer 1 where the entry is TRUE, answer 2 where the
# entry is FALSE)
a <- c(0, 1, 2, -4, 5)
ifelse(a > 0, 1 / a, NA)
# Result: 0 > 0 FALSE -> NA, 1 > 0 TRUE -> 1, 2 > 0 TRUE -> 0.5,
# -4 > 0 FALSE -> NA, 5 > 0 TRUE -> 0.2
# Replace NA (missing) values by zeros, or by another value, with ifelse().
library(dslabs)
data("na_example")
na_example
is.na(na_example) # TRUE where the value is NA (missing)
sum(is.na(na_example)) # number of NA (missing) values
no_nas <- ifelse(is.na(na_example), 0, na_example)
sum(is.na(no_nas))
# Where ifelse() finds an NA in the vector it returns 0 (answer 1); everywhere
# else it returns the original na_example value (answer 2).
# any(): TRUE if at least one entry of a logical vector is TRUE; FALSE only
# when all entries are FALSE.
# all(): TRUE only if all the entries of a logical vector are TRUE.
z <- c(TRUE, TRUE, FALSE)
any(z)
all(z)
z <- c(FALSE, FALSE, FALSE)
any(z)
all(z)
z <- c(TRUE, TRUE, TRUE)
any(z)
all(z)
# The any() and all() functions evaluate logical vectors.
# Define functions to perform the same operations over and over when the
# function does not already exist in R.
# A new function for the average can be defined as sum() / length(); the
# function mean() is already defined in R like this.
# avg(x): average of x, i.e. the sum of its elements over their count.
avg <- function(x) {
  s <- sum(x)
  s / length(x)
}
#write the whole code in different lines(shift&enter) to define well the new function
avg <- function(x) {
  # Count and total are local to the call; the quotient is handed back
  # explicitly through return().
  n <- length(x)
  s <- sum(x)
  return(s / n)
}
#return(value) or just the final value we want at the end
# Compare the user-defined avg() with the built-in mean().
x <- 1:100
avg(x)
mean(x)
identical(mean(x), avg(x))
# Variables defined inside function() { } are only available during that
# function call; they are not created in the workspace.
s <- 3
avg(1:10)
s
# The variable s in the workspace is still 3; the s that holds sum(x) only
# exists inside the avg() function.
# name_function <- function(argument names){operations on the arguments; the
# value on the last line, or return(value), is the result}
# function() tells R you are about to define a new function, assigned with <-
# Geometric mean = (x1*x2*...*xn)^(1/n): used for data on a log scale,
# percentages, geometric series, small values, ratios.
# Arithmetic mean = (x1+x2+...+xn)/n: used for interval and ratio data,
# inferential analysis, uniform distributions.
# avg(x, arithmetic = TRUE): average of x.
#   x          numeric vector.
#   arithmetic single TRUE/FALSE flag; TRUE (the default) returns the
#              arithmetic mean sum(x) / n, FALSE returns the geometric mean
#              prod(x)^(1 / n).
# The flag is a single value, so a plain if/else is used instead of the
# vectorised ifelse().
avg <- function(x, arithmetic = TRUE) {
  n <- length(x)
  if (arithmetic) {
    sum(x) / n
  } else {
    prod(x)^(1 / n)
  }
}
# Same function again, this time storing the sum in s before using it.
avg <- function(x, arithmetic = TRUE) {
  s <- sum(x)
  n <- length(x)
  if (arithmetic) {
    s / n
  } else {
    prod(x)^(1 / n)
  }
}
# The value of the last evaluated expression (the if/else) is what the function
# returns: the arithmetic average, or the geometric mean when arithmetic = FALSE.
# To check the formula 1 + 2 + ... + n = n * (n + 1) / 2, create a function that
# computes the sum of the first n integers.
# compute_s_n(n): takes a single number n, builds the vector 1 through n and
# returns its sum. seq_len() is used so that n = 0 gives an empty vector (sum 0)
# instead of the descending sequence c(1, 0) that 1:0 would produce.
compute_s_n <- function(n) {
  x <- seq_len(n)
  sum(x)
}
compute_s_n(3)
# For n = 3 the sum is 1 + 2 + 3 = 6
compute_s_n(100)
# For n = 100 the sum is 1 + 2 + 3 + ... + 100 = 5050
compute_s_n(2)
# For n = 2 the sum is 1 + 2 = 3
# For-loops: perform the same task over and over while changing the value of n:
# 1) define the range of n, 2) change the value, 3) evaluate the expression on
# each pass of the loop.
# for(i in sequence){operations / expressions on i}
# At the end of the loop the value of i is the last value of the sequence.
for (i in 1:5) {
  print(i)
}
i
# seq_len(5) generates the same sequence 1, 2, ..., 5.
for (i in seq_len(5)) {
  print(i)
}
# return(value) is used when defining new functions in R.
# print() shows the value stored in an object; useful while writing and
# creating function expressions.
# vector() creates a vector of a given length and mode.
# To compute the sums for the values from 1 to 25, first create a vector of
# length 25 and store the results in it as they are computed.
# Fill s_n with compute_s_n(1), ..., compute_s_n(m) using a for-loop.
m <- 25
s_n <- vector(mode = "logical", length = m)
for (n in seq_len(m)) {
  # Inside the loop we can call the compute_s_n() function created before;
  # n is the value that changes from 1 through m.
  s_n[n] <- compute_s_n(n)
}
n <- seq_len(m)
plot(x = n, y = s_n)
lines(x = n, y = n * (n + 1) / 2)
lines(x = n, y = s_n)
# The points and both lines fall on the same graph; remember s_n is
# n * (n + 1) / 2.
# Same computation with the literal 25 written out instead of m.
s_n <- vector(mode = "logical", length = 25)
for (n in seq_len(25)) {
  s_n[n] <- compute_s_n(n)
}
n <- seq_len(25)
plot(x = n, y = s_n)
lines(x = n, y = n * (n + 1) / 2)
lines(x = n, y = s_n)
# Start from an existing integer vector 1..25 instead of calling vector(); every
# entry is overwritten inside the loop, so the result is the same.
s_n <- 1:25
for (n in seq_len(25)) {
  s_n[n] <- compute_s_n(n)
}
n <- seq_len(25)
plot(x = n, y = s_n)
# In s_n[n] the n between [] is the position in the vector: s_n[1] is the 1st
# element, s_n[2] the 2nd, ..., s_n[25] the 25th.
# Inside the for-loop we must assign to s_n[n] so that the sum for each n is
# stored in its own position of the s_n vector.
# Use [] to select more than one element, whereas [[]] and $ select a single element.
# Search "r-cheat-sheet" on Google for an easy summary of R.
# For-loops are useful when working with vectors, to iterate over each element
# of a vector and do some computation.
# We can also use for-loops to create or extend vectors, by assigning inside
# the for-loop.
x <- c(1, 3, 4, 7)
x
# For any integer i between 1 and 4, x[i] denotes the i-th element of the vector.
x[1]
x[2]
x[3]
x[4]
# 1st: define the variable n to store the number of elements in x using length().
# 2nd: use the variable i to loop through the numbers 1, 2, 3, 4 (up to the
# length of x) inside the for-loop.
n <- length(x)
for (i in seq_len(n)) {
  x[i] <- x[i] + i
}
x
# The for-loop is equivalent to running the four commands below on the original
# vector, so x is reset first (otherwise each element would be incremented twice).
x <- c(1, 3, 4, 7)
x[1] <- x[1] + 1
x[2] <- x[2] + 2
x[3] <- x[3] + 3
x[4] <- x[4] + 4
# i is the numeric index used inside [] to extract or replace an element of x;
# inside the for-loop we write x[i] <- value.
# Example: the next for-loop builds a vector with five components where each
# component is double the previous one. The vector a starts with one element
# and is extended by assigning to position i + 1.
n <- 4
a <- 1
for (i in seq_len(n)) {
  a[i + 1] <- 2 * a[i]
}
a
# Source: "For-loops in R" from mathinsight.org
# Different packages may use the same name for functions that do completely
# different things.
# The message received when we load a package tells us which functions have the
# same name as functions in another package.
search()
library(dplyr)
library(dslabs)
search()
# dplyr and dslabs are now attached; search() shows the order of the loaded
# packages.
# Force the use of a specific namespace with the double colon ::
# The double colon also lets us use a function from a package without loading
# the entire package.
stats::filter
dplyr::filter
# Other useful functions to look up on the web: split, cut, quantile, reduce,
# identical, unique
# Functionals: more powerful, similar functions used instead of for-loops are
# apply, sapply, tapply, mapply, vapply, replicate.
# Vectorization is preferred over for-loops: a vectorized function applies the
# same operation to each element of a vector.
x <- 1:10
sqrt(x)
options(digits = 3)
sqrt(x)
y <- 1:10
x * y
n <- 1:25
compute_s_n(n)
# compute_s_n() is not vectorized: it does not return one result per element of
# n, which is why we previously used for-loops.
# Functionals apply the same function to each entry of a vector, matrix,
# data frame or list.
x <- 1:10
sapply(x, sqrt)
# sapply(): each element of x is passed to sqrt() and the results are returned
# in an object of the same length as x.
# sapply(x, function) returns a vector (or matrix) by default.
lapply(x, sqrt)
# lapply(x, function) returns a list of the same length as x.
n <- 1:25
sapply(n, compute_s_n)
# sapply() lets us run compute_s_n() on each element of n without a for-loop.
rep(x, 2)
# Repeats the vector x 2 times.
quantile(x) # splits x at the 0, 25, 50, 75 and 100 % quantiles
range(x) # gives the min and the max of the variable x
# Exercises 3.6
x <- c(1, 2, -3, 4)
if (all(x > 0)) {
  print("All positives")
} else {
  print("Not all positives")
}
z <- c(TRUE, FALSE, FALSE)
any(z)
all(z)
!z
any(!z)
all(!z)
# Answer: any() gives TRUE if any entry is TRUE; all() gives FALSE unless every
# entry is TRUE. Here z has at least one TRUE entry, so all(!z) is FALSE.
# nchar(): number of characters of each element of a character vector.
nchar(murders$state)
nchar(murders$abb)
# Do not assign inside the arguments of ifelse(): both assignments can be
# evaluated, so new_names would end up holding whichever one ran last instead
# of the element-wise choice. Assign the result of ifelse() instead.
# In one line of code, store in new_names the abbreviation for the states whose
# name is longer than 8 characters and the full state name otherwise.
new_names <- ifelse(nchar(murders$state) > 8, murders$abb, murders$state)
new_names
sum(1:5000)
# sum_n(n): sum of the integers from 1 to a single number n (0 when n is 0).
sum_n <- function(n) {
  sum(seq_len(n))
}
sum_n(1)
sum_n(5000)
n <- 1:5000
sapply(n, sum_n)
max(sapply(n, sum_n))
# Define the function sum_n, then define n as the sequence 1:5000 and use
# sapply() to compute the sum for every value of n.
# Confirm the answer with a for-loop: preallocate s_n and store each result at
# its own position (assigning to s_n without an index would keep only the last
# value).
s_n <- vector("numeric", length(n))
for (i in n) {
  s_n[i] <- sum_n(i)
}
max(s_n)
x <- seq(0, 30, 10)
x
y <- seq(0, 15, 5)
y
# Quantities explored while working on the exercise.
sum(x) - sum(y)
x - y
sum(x - y)
# altman_plot(x, y): scatter plot of the difference y - x against the sum x + y.
# (A first attempt, plot(sum(x) - sum(y)), only drew a single number and was
# replaced by this definition.)
altman_plot <- function(x, y) {
  plot(x + y, y - x)
}
altman_plot(x, y)
# Redefine compute_s_n(n) to return the sum of squares 1^2 + 2^2 + ... + n^2,
# and get the result when n is 10.
compute_s_n <- function(n) {
  x <- seq_len(n)
  sum(x^2)
}
compute_s_n(10)
# A second, equivalent way of writing it. The function must use its argument n;
# overwriting n with 1:10 inside the body would return 385 for every input.
compute_s_n <- function(n) {
  sum(seq_len(n)^2)
}
compute_s_n(10)
n <- 10
compute_s_n(n)
# Both ways give the same result, 385, when n is 10.
# Define s_n as an empty numeric vector of length 25 with vector(), then store
# in it the sums of squares S_1, ..., S_25, first with a for-loop and then
# with sapply().
# The empty vector created with vector() works as a placeholder that is filled
# in later by the for-loop.
# The sum of squares is written inline here so that the result does not depend
# on which version of compute_s_n() is currently defined.
s_n <- vector("numeric", 25)
n <- 1:25
for (i in n) {
  s_n[i] <- sum(seq_len(i)^2)
}
s_n
sapply(n, function(k) sum(seq_len(k)^2))
# The for-loop and sapply() give the same vector of sums.
n
# Plot S_n versus n. Check that the variables are still well defined after
# using for-loops (a loop variable keeps its last value).
plot(n, s_n)
# Confirm the formula for the sum of squares: S_n = n * (n + 1) * (2 * n + 1) / 6
n * (n + 1) * ((2 * n) + 1) / 6
identical(s_n, n * (n + 1) * ((2 * n) + 1) / 6)
# R is case sensitive and needs straight quotes: install.packages("name").
install.packages("knitr")
# Section 4 assessment
library(dslabs)
data("heights")
library(dplyr)
# Recode sex with ifelse(): "Female" becomes 1, every other value becomes 2.
with(heights, ifelse(sex == "Female", 1, 2))
sum(with(heights, ifelse(sex == "Female", 1, 2)))
# Answer noted: the sum of the vector is 1862
# Keep the height when it is greater than 72 inches and use 0 otherwise.
with(heights, ifelse(height > 72, height, 0))
mean(with(heights, ifelse(height > 72, height, 0)))
# Answer noted: the mean of the vector is 9.653
# 1 foot = 12 inches. Create a function inches_to_ft() that takes a length in
# inches and returns the same length in feet.
x <- 144
x / 12
inches_to_ft <- function(x) {
  x / 12
}
inches_to_ft(144)
# Convert every height to feet once, show it, then count the values under 5 ft.
y <- inches_to_ft(heights$height)
y
y < 5
which(y < 5)
length(which(y < 5))
sum(y < 5)
# Answer noted: 144 inches are 12 feet, and there are 20 individuals with
# heights of less than 5 feet
# Given an integer x, the factorial of x is written x! and is the product of
# all integers up to and including x.
# The factorial() function computes factorials in R. For example, factorial(4)
# returns 4! = 4 * 3 * 2 * 1 = 24.
m <- 10
# Preallocate a numeric vector of length m and fill it inside the for-loop.
f_n <- vector("numeric", m)
for (n in seq_len(m)) {
  f_n[n] <- factorial(n)
}
f_n
# Same result with sapply(): apply factorial() to the integers 1 to 10, not to
# f_n (that would compute factorials of the factorials).
m <- 1:10
sapply(m, factorial)
# Answer: the vector f_n goes from 1, 2, ... to 3628800, by for-loop or sapply().