R Practice 2

1 Tipos de datos

# One scalar of each basic atomic type: numeric, character and logical.
x <- 28
y <- "R is fantastic"
z <- TRUE

# class() reports the type of each value; print them in declaration order.
invisible(lapply(list(x, y, z), function(value) print(class(value))))


[1] "numeric"

[1] "character"

[1] "logical"

1.1 Vectores

# Build a numeric vector with c() and display it.
datos <- c(2, 1, 3, -1, -10, 0, 0, 1)
print(datos)


[1]   2   1   3  -1 -10   0   0   1

1.1.1 Number sequences

# Four ways of generating regular sequences.
print(1:10)                   # colon operator: 1 to 10
print(seq(10))                # seq() with a single number counts from 1
print(seq(1, 10, by = 3))     # explicit step of 3
print(rep(1:4, times = 2))    # repeat the whole sequence twice

 [1]  1  2  3  4  5  6  7  8  9 10

 [1]  1  2  3  4  5  6  7  8  9 10

[1]  1  4  7 10

[1] 1 2 3 4 1 2 3 4

1.1.2 Access

# Indexing a vector by position, exclusion, index vector, range and mask.
print(datos)
print(datos[1])             # first element (indices start at 1)
print(datos[-4])            # every element except the 4th
print(datos[c(1, 3, 5)])    # elements at positions 1, 3 and 5
print(datos[3:5])           # positions 3 through 5

# Logical mask: TRUE where the value is greater than 1.
v <- datos > 1
print(v)
print(datos[v])             # keep only the elements where the mask is TRUE

[1]   2   1   3  -1 -10   0   0   1

[1] 2

[1]   2   1   3 -10   0   0   1

[1]   2   3 -10

[1]   3  -1 -10

[1]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE

[1] 2 3

1.1.3 Index names

Vectors can have names on their indexes.

# Integer vector 1..5 whose positions are labelled with weekday names.
# (`1:5` already is the sequence; wrapping it in seq() was redundant.)
v <- 1:5
names(v) <- c("Lun", "Mar", "Mie", "Jue", "Vie")
print(v)

# names() returns the labels as a character vector.
print(names(v))


Lun Mar Mie Jue Vie 
  1   2   3   4   5

[1] "Lun" "Mar" "Mie" "Jue" "Vie"

1.1.4 Functions

If a function does not return anything, then the vector may have a NA value.

# Common summary functions applied to the datos vector.
print(length(datos))  # number of elements
print(min(datos))  # smallest value
print(max(datos))  # largest value
print(sum(datos))  # total of all elements
print(mean(datos))  # arithmetic mean
print(median(datos))  # middle value of the sorted data
print(sort(datos))  # values in ascending order
print(unique(datos))  # values with duplicates removed
print(which(datos > 1))  # positions whose value is greater than 1
print(which.max(datos))  # position of the (first) maximum
print(which.min(datos))  # position of the (first) minimum

[1] 8

[1] -10

[1] 3

[1] -4

[1] -0.5

[1] 0.5

[1] -10  -1   0   0   1   1   2   3

[1]   2   1   3  -1 -10   0

[1] 1 3

[1] 3

[1] 5

# Plot the values of datos; with a single vector the x axis is the index.
plot(datos)

1.1.5 Factor

Nominal (unordered) or ordinal (ordered) categorical factor.

Used on Likert scale.

# Ordered factor: moments of the day with an explicit ordering of levels.
# Every value must match one of the levels exactly; a misspelled value
# (the original had 'afrernoon') is silently turned into NA by factor().
day_vector <- c(
  "evening", "morning", "afternoon", "midday", "midnight", "evening"
)

factor_day <- factor(
  day_vector,
  ordered = TRUE,
  levels = c("morning", "midday", "afternoon", "evening", "midnight")
)

# Unordered factor: levels default to the sorted unique values.
color_vector <- c("blue", "red", "yellow", "green", "white")

factor_color <- factor(color_vector)
print(factor_color)


[1] blue   red    yellow green  white 
Levels: blue green red white yellow

1.2 Data Frame

A new data frame can be created by using data.frame function.

# Four equally long vectors, one per column of the data frame.
a <- c(10, 20, 30, 40)
b <- c("book", "pen", "textbook", "pencil_case")
# NOTE: a variable named `c` shadows the name of base c(); calls such as
# c(...) below still work because R skips non-functions when looking up a
# function, but the name is kept only so the column is called "c".
c <- c(TRUE, FALSE, TRUE, FALSE)
d <- c(2.5, 8, 10, 7)

# stringsAsFactors = TRUE makes the character column a factor on every R
# version (the default changed to FALSE in R 4.0), which is what the printed
# structure and slices shown in this document rely on.
df <- data.frame(a, b, c, d, stringsAsFactors = TRUE)
print(df)


   a           b     c    d
1 10        book  TRUE  2.5
2 20         pen FALSE  8.0
3 30    textbook  TRUE 10.0
4 40 pencil_case FALSE  7.0

Change names on the data frame.

# Replace the default column names with descriptive ones.
colnames(df) <- c("ID", "items", "store", "price")
print(df)


  ID       items store price
1 10        book  TRUE   2.5
2 20         pen FALSE   8.0
3 30    textbook  TRUE  10.0
4 40 pencil_case FALSE   7.0

Print the structure and description of the data frame.

# Compact description of df: dimensions, then each column's type and values.
str(df)

'data.frame':	4 obs. of  4 variables:
 $ ID   : num  10 20 30 40
 $ items: Factor w/ 4 levels "book","pen","pencil_case",..: 1 2 4 3
 $ store: logi  TRUE FALSE TRUE FALSE
 $ price: num  2.5 8 10 7

1.2.1 Slices

# Slicing a data frame with [row, column].
print(df[1, 2])         # single cell: row 1, column 2
print(df[1:2, ])        # rows 1-2, every column
print(df[1:3, 3:4])     # rows 1-3, columns 3-4
print(df[, 1])          # first column, selected by position
print(df[, "ID"])       # same column, selected by name

[1] book
Levels: book pen pencil_case textbook

  ID items store price
1 10  book  TRUE   2.5
2 20   pen FALSE   8.0

  store price
1  TRUE   2.5
2 FALSE   8.0
3  TRUE  10.0

[1] 10 20 30 40

[1] 10 20 30 40

1.2.2 Add columns

The new vector must have as many elements as the data frame has rows.

# New column values, one per existing row of df.
quantity <- c(10, 35, 40, 5)

# Assigning to a column name that does not exist yet appends the column.
df[["quantity"]] <- quantity
print(df)


  ID       items store price quantity
1 10        book  TRUE   2.5       10
2 20         pen FALSE   8.0       35
3 30    textbook  TRUE  10.0       40
4 40 pencil_case FALSE   7.0        5

1.2.3 Subset

# Keep only the rows whose price is greater than 5.
print(subset(df, price > 5))

  ID       items store price quantity
2 20         pen FALSE     8       35
3 30    textbook  TRUE    10       40
4 40 pencil_case FALSE     7        5

1.2.4 Merge - Joins

A full merge is when every key in one column has a match in the other. A partial merge is when one of the columns has keys that are not in the other; the unmatched rows are then filled with NA values.

# Left table: names keyed by column y.
x <- c("n1", "n2")
y <- c(1, 2)
z <- data.frame(x, y)
print(z)

# Right table: letters keyed by column s.
t <- c("a", "b")
s <- c(1, 2)
o <- data.frame(t, s)
print(o)

# Join the tables where z$y equals o$s; every key has a match here.
q <- merge(z, o, by.x = "y", by.y = "s")
print(q)


   x y
1 n1 1
2 n2 2

  t s
1 a 1
2 b 2

  y  x t
1 1 n1 a
2 2 n2 b

Partial match. The following snippet adds a new row into z.

# Append a third row to z. The row is built as a one-row data frame so that
# column y stays numeric: c("n2", 3) is a character vector, and binding it
# would turn the whole key column into character.
add_z <- data.frame(x = "n2", y = 3)
z <- rbind(z, add_z)

print(z)

# Partial match: all.x = TRUE keeps the rows of z that have no match in o
# and fills the missing columns with NA.
q <- merge(z, o, by.x = "y", by.y = "s", all.x = TRUE)
print(q)

# Total match: only the rows whose key appears in both tables are kept.
q <- merge(z, o, by.x = "y", by.y = "s")
print(q)


   x y
1 n1 1
2 n2 2
3 n2 3

  y  x    t
1 1 n1    a
2 2 n2    b
3 3 n2 <NA>

  y  x t
1 1 n1 a
2 2 n2 b

2 Plotting

# Radii 1..15 and the area of the circle with each radius (pi * r^2).
x <- 1:15
y <- pi * x^2

print(x)
print(y)


 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15

 [1]   3.141593  12.566371  28.274334  50.265482  78.539816 113.097336
 [7] 153.938040 201.061930 254.469005 314.159265 380.132711 452.389342
[13] 530.929158 615.752160 706.858347

# Scatter plot of y against x with custom axis labels; the y label is an
# expression() so it is drawn as a formula rather than as plain text.
plot(x, y, xlab="Radio", ylab=expression(Area == pi*r^2))

Correlation. If the p-value is below 0.05, the correlation is statistically significant. The closer cor is to 1 (or to -1), the stronger the linear relationship.

# Correlation test between x and y (Pearson by default, as the printed
# result title shows); res holds the test result.
res <- cor.test(x,y)
print(res)


	Pearson's product-moment correlation

data:  x and y
t = 15.029, df = 13, p-value = 1.348e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9168624 0.9910175
sample estimates:
      cor 
0.9724093

# Temperatures from -25 to 30 degrees Celsius and their Fahrenheit values.
celsius <- -25:30
fahrenheit <- 9 / 5 * celsius + 32

# Plotting a two-column data frame draws one column against the other.
x <- data.frame(
  Celsius = celsius,
  Fahrenheit = fahrenheit
)
plot(x)

# Formula interface: celsius on the y axis as a function of fahrenheit.
plot(celsius ~ fahrenheit)

2.1 Bar plot

# The survey data frame used from here on comes from the MASS package.
library(MASS)
# Contingency table: sex in rows, smoking habit in columns.
# NOTE: the variable name `t` is also the name of base t() (transpose).
t <- table(survey$Sex, survey$Smoke)
print(t)



       Heavy Never Occas Regul
Female     5    99     9     5
Male       6    89    10    12

# Grouped bar chart built directly from the sex-by-smoking table.
barplot(table(survey$Sex, survey$Smoke), beside = TRUE)

# Same chart from the table stored in t.
barplot(t, beside = TRUE)

# Same chart with a title, a legend and one colour per sex.
barplot(
  t,
  beside = TRUE,
  main = "Regularidad por sexo",
  col = c("pink", "black"),
  legend.text = c("Female", "Male")
)

2.2 Pie plot

# Frequency of each value of the Sex column.
f <- table(survey$Sex)
print(f)


Female   Male 
   118    118

# Frequency of each value of the W.Hnd column; f is reused by the pie charts.
f <- table(survey$W.Hnd)
print(f)


Left Right 
  18   218

# Pie chart with the default labels (the names of the table).
pie(f)

# Share of each category as a rounded percentage.
z <- round(f / sum(f) * 100)
print(z)

# Labels of the form "<name> <percentage> %" (paste separates with a space).
lbs <- paste(names(f), z, "%")
print(lbs)


 Left Right 
    8    92

[1] "Left 8 %"   "Right 92 %"

# Pie chart with a title and the percentage labels.
pie(
  f,
  labels = lbs,
  main = "Cantidad de encuestados diestros y zurdos"
)

# Same chart with one rainbow colour per label.
pie(
  f,
  labels = lbs,
  col = rainbow(length(lbs)),
  main = "Cantidad de encuestados diestros y zurdos"
)

2.3 Histograms

# Histogram of the Age column.
hist(survey$Age)

The following histogram is approximately normally distributed.

# Histogram of the Height column.
hist(survey$Height)

Testing normality.

# Shapiro-Wilk normality test on the Age column.
print(shapiro.test(survey$Age))


	Shapiro-Wilk normality test

data:  survey$Age
W = 0.45642, p-value < 2.2e-16

If the p-value is greater than 0.05, normality cannot be rejected, so the data can be treated as normally distributed.

# Shapiro-Wilk normality test on the Height column.
print(shapiro.test(survey$Height))


	Shapiro-Wilk normality test

data:  survey$Height
W = 0.98841, p-value = 0.08844

2.3.1 Example

#........................................Histograma

tiempo <- c(
  11.50, 10.26, 10.08, 13.00, 11.14, 13.73, 13.41, 10.44, 11.36, 14.40,
  11.64, 12.39, 12.82, 14.25, 15.41, 14.35, 9.35, 12.40, 9.04, 15.30,
  14.79, 15.27, 10.63, 14.30, 15.48, 14.80, 8.78, 14.00, 13.09, 10.00,
  12.20, 11.70, 15.37, 11.81, 10.06, 12.49, 8.58, 11.32, 12.20, 12.45,
  11.28, 12.60, 14.36, 13.08, 13.50, 12.68, 9.19, 14.32, 12.17, 9.10
)

# Print 7 successive class limits: start at the minimum and add 0.99 each
# time. The accumulator is called `limite` instead of `sum` so that it does
# not shadow the base function sum().
limite <- min(tiempo)
for (i in seq_len(7)) {
  limite <- limite + 0.99
  print(limite)
}

# Bar plot data ----

# Likert-style answers, 10 of each category.
x <- c(
  rep("muy de acuerdo", 10), rep("de acuerdo", 10),
  rep("poco de acuerdo", 10), rep("para nada de acuerdo", 10)
)

# Ordered factor from least to most agreement (`ordered` spelled out in
# full instead of relying on partial matching of `order`).
factor_x <- factor(
  x,
  ordered = TRUE,
  levels = c(
    "para nada de acuerdo", "poco de acuerdo", "de acuerdo", "muy de acuerdo"
  )
)
# Numeric code of each answer: 1 for the lowest level up to 4 for the highest.
w <- as.numeric(factor_x)

# Sex of each respondent, in the same order as x.
y <- c(rep("hombre", 18), rep("mujer", 2), rep("mujer", 18), rep("hombre", 2))


# Box plot data ----
g <- c(36, 25, 37, 24, 39, 20, 36, 45, 31, 31, 39, 24, 29, 23, 41, 40, 33, 24, 34, 40)
# Explicit print() so the vector is also shown when the file is source()d.
print(g)


[1] 9.57
[1] 10.56
[1] 11.55
[1] 12.54
[1] 13.53
[1] 14.52
[1] 15.51

 [1] 36 25 37 24 39 20 36 45 31 31 39 24 29 23 41 40 33 24 34 40

# Sample of 50 time measurements used in the histogram example.
tiempo <- c(
  11.50, 10.26, 10.08, 13.00, 11.14, 13.73, 13.41, 10.44, 11.36, 14.40,
  11.64, 12.39, 12.82, 14.25, 15.41, 14.35, 9.35, 12.40, 9.04, 15.30,
  14.79, 15.27, 10.63, 14.30, 15.48, 14.80, 8.78, 14.00, 13.09, 10.00,
  12.20, 11.70, 15.37, 11.81, 10.06, 12.49, 8.58, 11.32, 12.20, 12.45,
  11.28, 12.60, 14.36, 13.08, 13.50, 12.68, 9.19, 14.32, 12.17, 9.10
)

11.5
10.26
10.08
13
11.14
13.73
13.41
10.44
11.36
14.4
11.64
12.39
12.82
14.25
15.41
14.35
9.35
12.4
9.04
15.3
14.79
15.27
10.63
14.3
15.48
14.8
8.78
14
13.09
10
12.2
11.7
15.37
11.81
10.06
12.49
8.58
11.32
12.2
12.45
11.28
12.6
14.36
13.08
13.5
12.68
9.19
14.32
12.17
9.1

# Extremes of the sample.
tmin <- min(tiempo)
tmax <- max(tiempo)

# range() returns both extremes at once as c(min, max).
trange <- range(tiempo)
print(trange)

# Width of the sample: largest value minus smallest value.
trange2 <- tmax - tmin
print(trange2)


[1]  8.58 15.48

[1] 6.9

# Number of classes: square root of the sample size ...
int_clase <- sqrt(length(tiempo))
print(int_clase)

# ... rounded to the nearest whole number.
int_clase <- round(int_clase)
print(int_clase)


[1] 7.071068

[1] 7

# Class width: sample width divided by the number of classes ...
amplitud <- trange2 / int_clase
print(amplitud)

# ... rounded to the nearest whole number.
amplitud <- round(amplitud)
print(amplitud)


[1] 0.9857143

[1] 1

The number of classes in a histogram can be changed with the nclass parameter.

# Histogram of Height requesting about 20 classes through nclass.
hist(survey$Height, nclass=20)

2.3.2 Cumulative (Frequency) Histograms

z$counts has the frequency of the elements in the histograms.

# hist() draws the histogram and also returns an object describing it.
z <- hist(survey$Height)

# counts holds the number of observations in each class.
print(z$counts)

8
16
29
45
33
30
26
14
6
2

# Replace the per-class counts with their running total.
z[["counts"]] <- cumsum(z[["counts"]])

print(z[["counts"]])

8
24
53
98
131
161
187
201
207
209

# Draw the histogram object again, now with the accumulated counts.
plot(z)

2.4 Box Plot

# Sample of 20 values for the box plot examples.
g <- c(
  36, 25, 37, 24, 39, 20, 36, 45, 31, 31,
  39, 24, 29, 23, 41, 40, 33, 24, 34, 40
)

# boxplot() draws the plot and returns the statistics it is based on.
bp1 <- boxplot(g)
print(bp1)

$stats
     [,1]
[1,] 20.0
[2,] 24.5
[3,] 33.5
[4,] 39.0
[5,] 45.0

$n
[1] 20

$conf
         [,1]
[1,] 28.37717
[2,] 38.62283

$out
numeric(0)

$group
numeric(0)

$names
[1] "1"

Horizontal box plot.

# Same box plot drawn horizontally; the returned statistics do not change.
bp2 <- boxplot(g, horizontal = TRUE)
print(bp2)

$stats
     [,1]
[1,] 20.0
[2,] 24.5
[3,] 33.5
[4,] 39.0
[5,] 45.0

$n
[1] 20

$conf
         [,1]
[1,] 28.37717
[2,] 38.62283

$out
numeric(0)

$group
numeric(0)

$names
[1] "1"

bp1$stats returns the quartiles values.

# Sorted sample, to compare by eye with the statistics below.
print(sort(g))

# The five values the box plot is drawn from.
print(bp1$stats)

 [1] 20 23 24 24 24 25 29 31 31 33 34 36 36 37 39 39 40 40 41 45

     [,1]
[1,] 20.0
[2,] 24.5
[3,] 33.5
[4,] 39.0
[5,] 45.0

2.4.1 Quartiles

# Sorted copy of g, used to compute the quartiles by hand.
g1 <- sort(g)
print(g1)


[1] 20 23 24 24 24 25 29 31 31 33 34 36 36 37 39 39 40 40 41 45

The first quartile is the arithmetic mean of the value at position n/4 of the sorted data and the value that follows it.

# First quartile by hand: average of the element at position n/4 and the
# element right after it in the sorted vector.
print(local({
  pos <- length(g1) / 4
  (g1[pos] + g1[pos + 1]) / 2
}))


[1] 24.5

# The median matches the middle value reported in bp1$stats.
print(median(g1))

[1] 33.5

3 Correlation

# Smallest NW.Hnd value; na.rm=TRUE ignores the missing observations.
print(min(survey$NW.Hnd, na.rm=TRUE))

[1] 12.5

# Scatter plot of the two hand-span columns against each other.
plot(survey$Wr.Hnd, survey$NW.Hnd)

# Correlation test between the two columns, using the default method.
res <- cor.test(survey$Wr.Hnd, survey$NW.Hnd)
print(res)


	Pearson's product-moment correlation

data:  survey$Wr.Hnd and survey$NW.Hnd
t = 45.712, df = 234, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9336780 0.9597816
sample estimates:
      cor 
0.9483103

To ensure that the Pearson method is applied, use the method parameter.

# Same test with the method stated explicitly.
res <- cor.test(survey$Wr.Hnd, survey$NW.Hnd, method="pearson")
print(res)


	Pearson's product-moment correlation

data:  survey$Wr.Hnd and survey$NW.Hnd
t = 45.712, df = 234, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9336780 0.9597816
sample estimates:
      cor 
0.9483103

3.1 Wilcox correlation

When Likert-scale (ordinal) data is compared against another factor, the appropriate parametric or non-parametric test has to be applied; the Wilcoxon rank sum test used here is non-parametric.

# Likert-style answers, 10 of each category.
x <- c(
  rep("muy de acuerdo", 10), rep("de acuerdo", 10),
  rep("poco de acuerdo", 10), rep("para nada de acuerdo", 10)
)
# Ordered factor from least to most agreement. The argument is spelled
# `ordered` in full rather than relying on partial matching of `order`.
factor_x <- factor(
  x,
  ordered = TRUE,
  levels = c(
    "para nada de acuerdo", "poco de acuerdo", "de acuerdo", "muy de acuerdo"
  )
)
print(factor_x)

# Numeric code of each answer: 1 for the lowest level up to 4 for the highest.
w <- as.numeric(factor_x)
print(w)

# Sex of each respondent, in the same order as x.
y <- c(rep("hombre", 18), rep("mujer", 2), rep("mujer", 18), rep("hombre", 2))
print(y)


 [1] muy de acuerdo       muy de acuerdo       muy de acuerdo      
 [4] muy de acuerdo       muy de acuerdo       muy de acuerdo      
 [7] muy de acuerdo       muy de acuerdo       muy de acuerdo      
[10] muy de acuerdo       de acuerdo           de acuerdo          
[13] de acuerdo           de acuerdo           de acuerdo          
[16] de acuerdo           de acuerdo           de acuerdo          
[19] de acuerdo           de acuerdo           poco de acuerdo     
[22] poco de acuerdo      poco de acuerdo      poco de acuerdo     
[25] poco de acuerdo      poco de acuerdo      poco de acuerdo     
[28] poco de acuerdo      poco de acuerdo      poco de acuerdo     
[31] para nada de acuerdo para nada de acuerdo para nada de acuerdo
[34] para nada de acuerdo para nada de acuerdo para nada de acuerdo
[37] para nada de acuerdo para nada de acuerdo para nada de acuerdo
[40] para nada de acuerdo
4 Levels: para nada de acuerdo < poco de acuerdo < ... < muy de acuerdo

 [1] 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1
[39] 1 1

 [1] "hombre" "hombre" "hombre" "hombre" "hombre" "hombre" "hombre" "hombre"
 [9] "hombre" "hombre" "hombre" "hombre" "hombre" "hombre" "hombre" "hombre"
[17] "hombre" "hombre" "mujer"  "mujer"  "mujer"  "mujer"  "mujer"  "mujer" 
[25] "mujer"  "mujer"  "mujer"  "mujer"  "mujer"  "mujer"  "mujer"  "mujer" 
[33] "mujer"  "mujer"  "mujer"  "mujer"  "mujer"  "mujer"  "hombre" "hombre"

The Wilcoxon test yields a p-value below 0.05, which means that the Likert responses (w, the numeric coding of x) differ significantly between the sexes (y).

# Pair each numeric Likert code with the respondent's sex.
m <- data.frame(w, y)
# Two-sided Wilcoxon rank sum test of w grouped by y. The result is printed
# explicitly so that it is also shown when the file is run with source(),
# where top-level values are not auto-printed.
print(wilcox.test(w ~ y, alternative = "two.sided", data = m))
print(m)


	Wilcoxon rank sum test with continuity correction

data:  w by y
W = 360, p-value = 8.405e-06
alternative hypothesis: true location shift is not equal to 0

Warning message:
In wilcox.test.default(x = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,  :
  cannot compute exact p-value with ties

   w      y
1  4 hombre
2  4 hombre
3  4 hombre
4  4 hombre
5  4 hombre
6  4 hombre
7  4 hombre
8  4 hombre
9  4 hombre
10 4 hombre
11 3 hombre
12 3 hombre
13 3 hombre
14 3 hombre
15 3 hombre
16 3 hombre
17 3 hombre
18 3 hombre
19 3  mujer
20 3  mujer
21 2  mujer
22 2  mujer
23 2  mujer
24 2  mujer
25 2  mujer
26 2  mujer
27 2  mujer
28 2  mujer
29 2  mujer
30 2  mujer
31 1  mujer
32 1  mujer
33 1  mujer
34 1  mujer
35 1  mujer
36 1  mujer
37 1  mujer
38 1  mujer
39 1 hombre
40 1 hombre

# Grouped bar charts of the answers by sex. The legend argument is spelled
# `legend.text` in full instead of relying on partial matching of `legend`.

# Table built from the data frame: w in rows, y in columns.
barplot(table(m), beside = TRUE, legend.text = TRUE)

# Table with the roles swapped: y in rows, w in columns.
barplot(table(y, w), beside = TRUE, legend.text = TRUE)

4 License of This Work

This work is licensed under the Creative Commons Attribution-NoDerivatives 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nd/4.0/.

R Practice 2 by Gimenez Christian is licensed under a Creative Commons Attribution-NoDerivatives 4.0 International License.