Mineração de Dados: o que fazer? albertini/1sem2017/md/aulas/02preproc.pdf · 2017-04-13 ·...

Preview:

Citation preview

Mineracao de Dados: o que fazer?

1. Decidir o que voce quer fazer: tarefas de mineracao

2. Descrever as tarefas em forma de um programa de computador

3. Executar o programa e apresentar o resultado

Packages para analise de dados: pre-processamento

- Visualização de dados: ggplot2, googleVis, rworldmap

- Manipulação de dados: dplyr, data.table

- Imputação de dados faltantes: missForest, missMDA, mice, mi, imputation

- Detecção de outliers: outliers, robust, psych

- Seleção de características: fscaret, RRF

- Redução de dimensionalidade: FactoMineR, dimRed

- Chickwts: estudo sobre o crescimento de frangos de acordo com o suplemento alimentar

data(chickwts)

plot(chickwts$feed)

casein horsebean linseed meatmeal soybean sunflower

02

46

810

1214

data(chickwts)

feeds <- table(chickwts$feed)# obter frequencias

feeds

##

## casein horsebean linseed meatmeal soybean

## 12 10 12 11 14

## sunflower

## 12

barplot(feeds)

casein horsebean linseed meatmeal soybean sunflower

02

46

810

1214

barplot(feeds[order(feeds, decreasing=TRUE)])

soybean casein linseed sunflower meatmeal horsebean

02

46

810

1214

barplot(feeds[order(feeds)], horiz=TRUE, las=1,

col=topo.colors(length(feeds)), border=NA,

main="Animais alimentados \npor suplemento (chickwts)",

xlab="Numero de frangos")

horsebean

meatmeal

casein

linseed

sunflower

soybean

Animais alimentados por suplemento (chickwts)

Número de frangos

0 2 4 6 8 10 12 14

feeds <- table(chickwts$feed)

pie(feeds)

casein

horsebean

linseed

meatmeal

soybean

sunflower

pie(feeds[order(feeds, decreasing=TRUE)],

init.angle=90,

clockwise=TRUE,

col = c("seashell", "cadetblue2", "lightpink",

"lightcyan", "plum1", "papayawhip"),

main = "Uso de cada suplemento (chickwts)")

soybean

casein

linseed

sunflower

meatmeal

horsebean

Uso de cada suplemento (chickwts)

data(lynx) # captura de felinos Lynx 1821 - 1934

hist(lynx)

Histogram of lynx

lynx

Fre

quen

cy

0 1000 2000 3000 4000 5000 6000 7000

010

2030

4050

60

data(lynx) # captura de felinos Lynx 1821 - 1934

h <- hist(lynx, breaks=11, freq=FALSE,

col = "thistle1",

main = "Captura anual de linces no Canada\n1821 - 1934",

xlab = "Numero de linces capturados")

curve(dnorm(x, mean=mean(lynx), sd=sd(lynx)),

col = "thistle4",

lwd = 2,

add = TRUE)

Captura anual de linces no Canadá1821 − 1934

Número de linces capturados

Den

sity

0 1000 2000 3000 4000 5000 6000 7000

0e+

002e

−04

4e−

046e

−04

data(USJudgeRatings) # ratings of judges

boxplot(USJudgeRatings$RTEN)

●●

56

78

9

# Horizontal, notched boxplots: one box per column of USJudgeRatings,
# all drawn on a common 0-10 scale.
data(USJudgeRatings) # ratings of judges
boxplot(USJudgeRatings,
        horizontal = TRUE, las = 1, notch = TRUE,
        col = "slategray3", boxwex = 0.5, whisklty = 1,
        outpch = 16, outcol = "slategray3",
        # Was misspelled "stoplelty", which is not a boxplot/bxp parameter;
        # staplelty = 0 requests a blank line type for the whisker end caps.
        staplelty = 0,
        ylim = c(0, 10), xlab = "Avaliações")

●●

●●

●●

● ●● ●

● ● ●●

CONT

INTG

DMNR

DILG

CFMG

DECI

PREP

FAMI

ORAL

WRIT

PHYS

RTEN

0 2 4 6 8 10

Avaliações

# Density-scale histogram of the Fertility column of the swiss data set.
data(swiss) # 1888 data on Swiss fertility and economy
fertility <- swiss$Fertility
# prob = TRUE puts densities (not counts) on the y axis, so that density
# curves can later be overlaid on the same scale.
hist(fertility, prob = TRUE, ylim = c(0, 0.04),
     xlim = c(30, 100), breaks = 11, col = "gray",
     main = "Fertilidade nas 47 provincias suíças")

Fertilidade nas 47 provincias suíças

fertility

Den

sity

30 40 50 60 70 80 90 100

0.00

0.01

0.02

0.03

0.04

curve(dnorm(x, mean=mean(fertility), sd =sd(fertility)),

col="red", lwd=3, add=TRUE)

lines(density(fertility), col="blue")

lines(density(fertility, adjust=3), col="darkgreen")

rug(fertility, col="red") # plot de linhas sob hist

Fertilidade nas 47 províncias suíças

fertility

Den

sity

30 40 50 60 70 80 90 100

0.00

0.01

0.02

0.03

0.04

data(iris) ; pl = iris$Petal.Length

hist(pl, prob = TRUE, col = "gray",

main = "Comprimento de petalas de flores Iris")

curve(dnorm(x, mean=mean(pl), sd =sd(pl)),

col="red", lwd=3, add=TRUE)

lines(density(pl), lwd=4,col="darkgreen")

rug(pl, col="red") # plot de linhas sob hist

Comprimento de pétalas de flores Iris

pl

Den

sity

1 2 3 4 5 6 7

0.0

0.1

0.2

0.3

0.4

0.5

Visualizacao de dados: ggplot2

data(mpg) # hwy: mpg consumo de carros, drv: tipo

qplot(displ, hwy, data = mpg, color=drv)

●●

●●

●● ●●

● ●

● ●

●●

●●

● ●

●●

●●

● ●

●●

● ●

● ●

●●●

●●

●●

●●

●●

●●●

● ●

● ●

●●

● ●

●●

● ●

●●

●● ●●

●●

●●

●●

●●

●●

●●

●●

●● ●●

●●●

●● ●

20

30

40

2 3 4 5 6 7

displ

hwy

drv●

4

f

r

Visualizacao de dados: ggplot2

qplot(displ, hwy, data = mpg,

geom=c('point', #mantem os pontos

'smooth'))#tendencia dos pontos - area cinza

●●

●●

●● ●●

● ●

● ●

●●

●●

● ●

●●

●●

● ●

●●

● ●

● ●

●●●

●●

●●

●●

●●

●●●

● ●

● ●

●●

● ●

●●

● ●

●●

●● ●●

●●

●●

●●

●●

●●

●●

●●

●● ●●

●●●

●● ●

20

30

40

2 3 4 5 6 7

displ

hwy

Visualizacao de dados: ggplot2

qplot(hwy, data = mpg, fill=drv)

0

10

20

30

40

10 20 30 40

hwy

coun

t

drv

4

f

r

Visualizacao de dados: ggplot2

qplot(displ, data = mpg, facets=.~drv, col=cyl)

4 f r

2 4 6 2 4 6 2 4 6

0

5

10

15

20

displ

coun

t

Visualizacao de dados: ggplot2

qplot(hwy, data = mpg, facets=drv~.,binwidth=2)

4f

r

10 20 30 40

0

10

20

30

0

10

20

30

0

10

20

30

hwy

coun

t

qplot(Sepal.Length, Petal.Length, data = iris, col=Species,

size = Petal.Width, alpha = I(0.7),

xlab = 'Sepal Length', ylab = 'Petal Length',

main = "Sepal vs. Petal Length in Fisher's Iris data")

2

4

6

5 6 7 8

Sepal Length

Pet

al L

engt

h

Petal.Width

0.5

1.0

1.5

2.0

2.5

Species

setosa

versicolor

virginica

Sepal vs. Petal Length in Fisher's Iris data

qplot(age, circumference, data = Orange,

geom = c('point','line'), size=circumference,

colour = Tree,xlab='Idade',ylab='Circunferencia',

main = 'Circunferencia da laranjeira vs. idade')

● ●

● ●

●●

● ●

●●

● ●

●●

● ●

● ●

50

100

150

200

400 800 1200 1600

Idade

Circ

unfe

rênc

ia

circumference

●●

50

100

150

200

Tree●

3

1

5

2

4

Circunferência da laranjeira vs. idade

#displ: engine displacement (L)

# hwy: highway miles per gallon

ggplot(mpg, aes(displ, hwy, colour = class)) +

geom_point() +

geom_smooth(se = FALSE, method = 'lm')

●●

●●

●●●

●●

●●

●● ●●●

● ●

●●

●●

● ●

●●

●●

●●

● ●

●●

●●

●●

● ●

●●

●●

●●●

●●

● ●●

● ●

●●●●●●

●●

●●●●

●●●

●●

●●

●●

●●

●●●

●●●

●●

●●

●●

● ●

●●

●●

● ●

●●

●●●

● ●

●●

● ●●●

●●●

●●

●● ●●●

●●●

●●

●●

●●

●●

●●●

●●

●●

●●

●● ●●

●●

●●●●

●●

●● ●

20

30

40

2 3 4 5 6 7

displ

hwy

class●

2seater

compact

midsize

minivan

pickup

subcompact

suv

Visualizacao de dados: googleVis

# Install googleVis (the package name must be a quoted string; the bare
# symbol fails with "object 'googleVis' not found").
install.packages("googleVis")
# Name the package explicitly: by default demo() only searches attached
# packages, and googleVis has not been attached with library().
demo(googleVis, package = "googleVis")

Visualizacao de dados: mapas com rworldmap

plot(getMap())

points(airports$lon, airports$lat,col='red',pch=17,cex=.1)

lim = airports[airports$IATA_FAA=="LIM",c('lon','lat')]

udi = airports[airports$IATA_FAA=="UDI",c('lon','lat')]

lines(c(lim$lon, udi$lon),c(lim$lat, udi$lat), col='blue')

Manipulacao de dados: dplyr

## require(dplyr)

data(mtcars); mtcars = tbl_df(mtcars)

mtcars %>% group_by(cyl, am) %>%

select(mpg, cyl, wt, am) %>%

summarise(avgmpg = mean(mpg), avgwt = mean(wt)) %>%

filter(avgmpg > 20)

## Source: local data frame [3 x 4]

## Groups: cyl [2]

##

## cyl am avgmpg avgwt

## <dbl> <dbl> <dbl> <dbl>

## 1 4 0 22.90000 2.93500

## 2 4 1 28.07500 2.04225

## 3 6 1 20.56667 2.75500

Manipulacao de dados: dplyr

require(nycflights13)#voos partindo de NY em 2013

data(flights)

glimpse(flights)

## Observations: 336,776

## Variables: 19

## $ year <int> 2013, 2013, 2013, 2013, 201...

## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

## $ dep_time <int> 517, 533, 542, 544, 554, 55...

## $ sched_dep_time <int> 515, 529, 540, 545, 600, 55...

## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3...

## $ arr_time <int> 830, 850, 923, 1004, 812, 7...

## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 7...

## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 1...

## $ carrier <chr> "UA", "UA", "AA", "B6", "DL...

## $ flight <int> 1545, 1714, 1141, 725, 461,...

## $ tailnum <chr> "N14228", "N24211", "N619AA...

## $ origin <chr> "EWR", "LGA", "JFK", "JFK",...

## $ dest <chr> "IAH", "IAH", "MIA", "BQN",...

## $ air_time <dbl> 227, 227, 160, 183, 116, 15...

## $ distance <dbl> 1400, 1416, 1089, 1576, 762...

## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, ...

## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0...

## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-...

Manipulacao de dados: dplyr

head(flights)

## # A tibble: 6 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 1 517 515 2

## 2 2013 1 1 533 529 4

## 3 2013 1 1 542 540 2

## 4 2013 1 1 544 545 -1

## 5 2013 1 1 554 600 -6

## 6 2013 1 1 554 558 -4

## # ... with 13 more variables: arr_time <int>,

## # sched_arr_time <int>, arr_delay <dbl>,

## # carrier <chr>, flight <int>, tailnum <chr>,

## # origin <chr>, dest <chr>, air_time <dbl>,

## # distance <dbl>, hour <dbl>, minute <dbl>,

## # time_hour <dttm>

Manipulacao de dados: dplyr

- Verbos (operadores de dados): filter() e slice(); arrange(); select() e rename(); distinct(); mutate() e transmute(); summarise(); sample_n() e sample_frac()

Manipulacao de dados: dplyrI filter(): seleciona linhas

# flights[flights$month == 1 & flights$day == 1, ]

filter(flights, month == 1, day == 1)

## # A tibble: 842 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 1 517 515 2

## 2 2013 1 1 533 529 4

## 3 2013 1 1 542 540 2

## 4 2013 1 1 544 545 -1

## 5 2013 1 1 554 600 -6

## 6 2013 1 1 554 558 -4

## 7 2013 1 1 555 600 -5

## 8 2013 1 1 557 600 -3

## 9 2013 1 1 557 600 -3

## 10 2013 1 1 558 600 -2

## # ... with 832 more rows, and 13 more variables:

## # arr_time <int>, sched_arr_time <int>,

## # arr_delay <dbl>, carrier <chr>, flight <int>,

## # tailnum <chr>, origin <chr>, dest <chr>,

## # air_time <dbl>, distance <dbl>, hour <dbl>,

## # minute <dbl>, time_hour <dttm>

Manipulacao de dados: dplyrI filter(): seleciona linhas de acordo com valores

# voos de janeiro OU | de fevereiro

filter(flights, month == 1 | month == 2)

## # A tibble: 51,955 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 1 517 515 2

## 2 2013 1 1 533 529 4

## 3 2013 1 1 542 540 2

## 4 2013 1 1 544 545 -1

## 5 2013 1 1 554 600 -6

## 6 2013 1 1 554 558 -4

## 7 2013 1 1 555 600 -5

## 8 2013 1 1 557 600 -3

## 9 2013 1 1 557 600 -3

## 10 2013 1 1 558 600 -2

## # ... with 51,945 more rows, and 13 more variables:

## # arr_time <int>, sched_arr_time <int>,

## # arr_delay <dbl>, carrier <chr>, flight <int>,

## # tailnum <chr>, origin <chr>, dest <chr>,

## # air_time <dbl>, distance <dbl>, hour <dbl>,

## # minute <dbl>, time_hour <dttm>

Manipulacao de dados: dplyr

# selecionar linhas que origem contem 'A'

glimpse(filter(flights, grepl('A', origin)))

## Observations: 104,662

## Variables: 19

## $ year <int> 2013, 2013, 2013, 2013, 201...

## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

## $ dep_time <int> 533, 554, 557, 558, 559, 60...

## $ sched_dep_time <int> 529, 600, 600, 600, 600, 60...

## $ dep_delay <dbl> 4, -6, -3, -2, -1, 0, 0, -8...

## $ arr_time <int> 850, 812, 709, 753, 941, 85...

## $ sched_arr_time <int> 830, 837, 723, 745, 910, 85...

## $ arr_delay <dbl> 20, -25, -14, 8, 31, -7, 12...

## $ carrier <chr> "UA", "DL", "EV", "AA", "AA...

## $ flight <int> 1714, 461, 5708, 301, 707, ...

## $ tailnum <chr> "N24211", "N668DN", "N829AS...

## $ origin <chr> "LGA", "LGA", "LGA", "LGA",...

## $ dest <chr> "IAH", "ATL", "IAD", "ORD",...

## $ air_time <dbl> 227, 116, 53, 138, 257, 152...

## $ distance <dbl> 1416, 762, 229, 733, 1389, ...

## $ hour <dbl> 5, 6, 6, 6, 6, 6, 6, 6, 6, ...

## $ minute <dbl> 29, 0, 0, 0, 0, 0, 0, 10, 5...

## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-...

Manipulacao de dados: dplyr

# selecionar linhas que origem contem 'A'

flights %>% select(carrier:dest) %>%

filter(grepl('A', origin)) %>%

glimpse()

## Observations: 104,662

## Variables: 5

## $ carrier <chr> "UA", "DL", "EV", "AA", "AA", "B6"...

## $ flight <int> 1714, 461, 5708, 301, 707, 371, 46...

## $ tailnum <chr> "N24211", "N668DN", "N829AS", "N3A...

## $ origin <chr> "LGA", "LGA", "LGA", "LGA", "LGA",...

## $ dest <chr> "IAH", "ATL", "IAD", "ORD", "DFW",...

Manipulacao de dados: dplyrI slice(): seleciona linhas por posicao

# flights[1:7, ]

slice(flights, 1:7)

## # A tibble: 7 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 1 517 515 2

## 2 2013 1 1 533 529 4

## 3 2013 1 1 542 540 2

## 4 2013 1 1 544 545 -1

## 5 2013 1 1 554 600 -6

## 6 2013 1 1 554 558 -4

## 7 2013 1 1 555 600 -5

## # ... with 13 more variables: arr_time <int>,

## # sched_arr_time <int>, arr_delay <dbl>,

## # carrier <chr>, flight <int>, tailnum <chr>,

## # origin <chr>, dest <chr>, air_time <dbl>,

## # distance <dbl>, hour <dbl>, minute <dbl>,

## # time_hour <dttm>

Manipulacao de dados: dplyrI arrange(): reordena linhas por campo

arrange(flights, year, month, day)

## # A tibble: 336,776 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 1 517 515 2

## 2 2013 1 1 533 529 4

## 3 2013 1 1 542 540 2

## 4 2013 1 1 544 545 -1

## 5 2013 1 1 554 600 -6

## 6 2013 1 1 554 558 -4

## 7 2013 1 1 555 600 -5

## 8 2013 1 1 557 600 -3

## 9 2013 1 1 557 600 -3

## 10 2013 1 1 558 600 -2

## # ... with 336,766 more rows, and 13 more variables:

## # arr_time <int>, sched_arr_time <int>,

## # arr_delay <dbl>, carrier <chr>, flight <int>,

## # tailnum <chr>, origin <chr>, dest <chr>,

## # air_time <dbl>, distance <dbl>, hour <dbl>,

## # minute <dbl>, time_hour <dttm>

Manipulacao de dados: dplyrI arrange(): reordena linhas por campo

# ordena decrescente de acordo com atraso de chegada

#flights[order(flights$arr_delay, decreasing = TRUE), ]

arrange(flights, desc(arr_delay))

## # A tibble: 336,776 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 9 641 900 1301

## 2 2013 6 15 1432 1935 1137

## 3 2013 1 10 1121 1635 1126

## 4 2013 9 20 1139 1845 1014

## 5 2013 7 22 845 1600 1005

## 6 2013 4 10 1100 1900 960

## 7 2013 3 17 2321 810 911

## 8 2013 7 22 2257 759 898

## 9 2013 12 5 756 1700 896

## 10 2013 5 3 1133 2055 878

## # ... with 336,766 more rows, and 13 more variables:

## # arr_time <int>, sched_arr_time <int>,

## # arr_delay <dbl>, carrier <chr>, flight <int>,

## # tailnum <chr>, origin <chr>, dest <chr>,

## # air_time <dbl>, distance <dbl>, hour <dbl>,

## # minute <dbl>, time_hour <dttm>

Manipulacao de dados: dplyrI select(): seleciona colunas por nomes

select(flights, month, day, arr_delay)

## # A tibble: 336,776 × 3

## month day arr_delay

## <int> <int> <dbl>

## 1 1 1 11

## 2 1 1 20

## 3 1 1 33

## 4 1 1 -18

## 5 1 1 -25

## 6 1 1 12

## 7 1 1 19

## 8 1 1 -14

## 9 1 1 -8

## 10 1 1 8

## # ... with 336,766 more rows

Manipulacao de dados: dplyr

arrange(select(flights, month, day, arr_delay),

desc(arr_delay))

## # A tibble: 336,776 × 3

## month day arr_delay

## <int> <int> <dbl>

## 1 1 9 1272

## 2 6 15 1127

## 3 1 10 1109

## 4 9 20 1007

## 5 7 22 989

## 6 4 10 931

## 7 3 17 915

## 8 7 22 895

## 9 12 5 878

## 10 5 3 875

## # ... with 336,766 more rows

Manipulacao de dados: dplyr

# usar somente as coluna de year ate day (ambas inclusas)

select(flights, year:day)

## # A tibble: 336,776 × 3

## year month day

## <int> <int> <int>

## 1 2013 1 1

## 2 2013 1 1

## 3 2013 1 1

## 4 2013 1 1

## 5 2013 1 1

## 6 2013 1 1

## 7 2013 1 1

## 8 2013 1 1

## 9 2013 1 1

## 10 2013 1 1

## # ... with 336,766 more rows

Manipulacao de dados: dplyr

# usar todas as colunas EXCETO de year ate day

select(flights, -(year:day))

## # A tibble: 336,776 × 16

## dep_time sched_dep_time dep_delay arr_time

## <int> <int> <dbl> <int>

## 1 517 515 2 830

## 2 533 529 4 850

## 3 542 540 2 923

## 4 544 545 -1 1004

## 5 554 600 -6 812

## 6 554 558 -4 740

## 7 555 600 -5 913

## 8 557 600 -3 709

## 9 557 600 -3 838

## 10 558 600 -2 753

## # ... with 336,766 more rows, and 12 more variables:

## # sched_arr_time <int>, arr_delay <dbl>,

## # carrier <chr>, flight <int>, tailnum <chr>,

## # origin <chr>, dest <chr>, air_time <dbl>,

## # distance <dbl>, hour <dbl>, minute <dbl>,

## # time_hour <dttm>

Manipulacao de dados: dplyr

# renomear coluna, mas fica somente com ela

select(flights, chassiNum = tailnum)

## # A tibble: 336,776 × 1

## chassiNum

## <chr>

## 1 N14228

## 2 N24211

## 3 N619AA

## 4 N804JB

## 5 N668DN

## 6 N39463

## 7 N516JB

## 8 N829AS

## 9 N593JB

## 10 N3ALAA

## # ... with 336,766 more rows

Manipulacao de dados: dplyr

# renomear coluna E manter outras

rename(flights, chassiNum = tailnum)

## # A tibble: 336,776 × 19

## year month day dep_time sched_dep_time dep_delay

## <int> <int> <int> <int> <int> <dbl>

## 1 2013 1 1 517 515 2

## 2 2013 1 1 533 529 4

## 3 2013 1 1 542 540 2

## 4 2013 1 1 544 545 -1

## 5 2013 1 1 554 600 -6

## 6 2013 1 1 554 558 -4

## 7 2013 1 1 555 600 -5

## 8 2013 1 1 557 600 -3

## 9 2013 1 1 557 600 -3

## 10 2013 1 1 558 600 -2

## # ... with 336,766 more rows, and 13 more variables:

## # arr_time <int>, sched_arr_time <int>,

## # arr_delay <dbl>, carrier <chr>, flight <int>,

## # chassiNum <chr>, origin <chr>, dest <chr>,

## # air_time <dbl>, distance <dbl>, hour <dbl>,

## # minute <dbl>, time_hour <dttm>

Manipulacao de dados: dplyr

distinct(flights, tailnum)

## # A tibble: 4,044 × 1

## tailnum

## <chr>

## 1 N14228

## 2 N24211

## 3 N619AA

## 4 N804JB

## 5 N668DN

## 6 N39463

## 7 N516JB

## 8 N829AS

## 9 N593JB

## 10 N3ALAA

## # ... with 4,034 more rows

Manipulacao de dados: dplyr

distinct(flights, origin, dest)

## # A tibble: 224 × 2

## origin dest

## <chr> <chr>

## 1 EWR IAH

## 2 LGA IAH

## 3 JFK MIA

## 4 JFK BQN

## 5 LGA ATL

## 6 EWR ORD

## 7 EWR FLL

## 8 LGA IAD

## 9 JFK MCO

## 10 LGA ORD

## # ... with 214 more rows

Manipulacao de dados: dplyrI mutate(): adiciona coluna (recodificacao de variaveis)

mutate(flights, # gain and speed are recoded (derived) variables

gain = arr_delay - dep_delay,

speed = distance / air_time * 60) %>%

select(tailnum, gain, speed)

## # A tibble: 336,776 × 3

## tailnum gain speed

## <chr> <dbl> <dbl>

## 1 N14228 9 370.0441

## 2 N24211 16 374.2731

## 3 N619AA 31 408.3750

## 4 N804JB -17 516.7213

## 5 N668DN -19 394.1379

## 6 N39463 16 287.6000

## 7 N516JB 24 404.4304

## 8 N829AS -11 259.2453

## 9 N593JB -5 404.5714

## 10 N3ALAA 10 318.6957

## # ... with 336,766 more rows

Manipulacao de dados: dplyr

# resumir tabela de acordo com origem do voo

flights %>%

group_by(origin) %>%

summarize(n=n())

## # A tibble: 3 × 2

## origin n

## <chr> <int>

## 1 EWR 120835

## 2 JFK 111279

## 3 LGA 104662

Manipulacao de dados: dplyr

# resumir tabela de acordo com origem e destino do voo

flights %>%

group_by(origin,dest) %>%

summarize(n=n()) %>%

arrange(desc(n))

## Source: local data frame [224 x 3]

## Groups: origin [3]

##

## origin dest n

## <chr> <chr> <int>

## 1 JFK LAX 11262

## 2 LGA ATL 10263

## 3 LGA ORD 8857

## 4 JFK SFO 8204

## 5 LGA CLT 6168

## 6 EWR ORD 6100

## 7 JFK BOS 5898

## 8 LGA MIA 5781

## 9 JFK MCO 5464

## 10 EWR BOS 5327

## # ... with 214 more rows

Manipulacao de dados: dplyr

# Organizar dados para obter atraso medio

# por distancia viajada

by_tailnum <- group_by(flights, tailnum)

delay <- summarise(by_tailnum, count = n(),

dist = mean(distance, na.rm = TRUE),

delay = mean(arr_delay, na.rm = TRUE))

delay <- filter(delay, count > 20, dist < 2000)

Manipulacao de dados: dplyr

ggplot(delay, aes(dist, delay)) +

geom_point(aes(size = count), alpha = 1/2) +

geom_smooth() + scale_size_area()

−20

0

20

40

60

500 1000 1500 2000

dist

dela

y

count

500

1000

1500

2000

2500

Manipulacao de dados: dplyr

I Conexao a bancos de dados usando dplyr

I Mais informacoes em [link]

my_db <- src_sqlite('my_db.sqlite3')

flights_tbl <- tbl(my_db, 'hflights') # nome da table

## possivel usar qualquer comando dplyr em flights_tbl

## possivel usar SQL

tbl(my_db, sql('SELECT * FROM hflights LIMIT 100'))

flights %>% group_by(origin, dest) %>% explain()

Manipulacao de dados: data.tableI Package data.table e util para grandes datasets

require(data.table)

require(hflights)

DT <- as.data.table(hflights)# voos de Houston, 2011

DT[Month==10,mean(na.omit(AirTime)), by=UniqueCarrier]

## UniqueCarrier V1

## 1: AA 68.76471

## 2: AS 255.29032

## 3: B6 176.93548

## 4: CO 141.52861

## 5: DL 92.76824

## 6: WN 87.14947

## 7: XE 82.44422

## 8: OO 114.98865

## 9: UA 166.18354

## 10: US 137.46078

## 11: EV 113.12273

## 12: F9 126.55357

## 13: FL 90.85561

## 14: MQ 100.13054

Imputacao de dados faltantes

- Pacotes: Missing Data Imputation and Model Checking (mi); Imputation (imputation); Multivariate Imputation by Chained Equations (mice); ... vários outros

- Dados podem faltar ... aleatoriamente, ou ... por alguma razão não óbvia

Porque imputacao? Problemas com dados faltantes

require(Hmisc)

x1 <- c(1,2,3,NA,5)

describe(x1)

## x1

## n missing distinct Info Mean Gmd

## 4 1 4 1 2.75 2.167

##

## Value 1 2 3 5

## Frequency 1 1 1 1

## Proportion 0.25 0.25 0.25 0.25

mean(x1) # PROBLEMA!

## [1] NA

Evitando dados faltantes

# Simple ways of working around missing values without an imputation package.
x1 <- c(1, 2, 3, NA, 5)
mean(x1, na.rm = TRUE) # skip the NA when averaging
## [1] 2.75
# Keep only the observed values (note the negation); this reduces the
# number of observations.
x2 <- x1[!is.na(x1)]
x3 <- x1
# Replace the NA in place; the mask must come from x3 itself, not from x2.
# The mean could be used instead of 0.
x3[is.na(x3)] <- 0
x4 <- ifelse(is.na(x1), 0, x1) # same replacement as x3, in one expression
## Or USE IMPUTATION packages

Imputacao de dados faltantes: mice

require(mice)

ar <- airquality

ar[4:10,3] <- rep(NA,7)

ar[1:5,4] <- NA

md.pattern(ar)#visualizar NA's nos dados

## Month Day Temp Solar.R Wind Ozone

## 104 1 1 1 1 1 1 0

## 34 1 1 1 1 1 0 1

## 4 1 1 1 0 1 1 1

## 3 1 1 1 1 0 1 1

## 3 1 1 0 1 1 1 1

## 1 1 1 1 0 1 0 2

## 1 1 1 1 1 0 0 2

## 1 1 1 1 0 0 1 2

## 1 1 1 0 1 0 1 2

## 1 1 1 0 0 0 0 4

## 0 0 5 7 7 37 56

Imputacao de dados faltantes: transformacao de variaveis

I Predictive mean matchingI statisticalhorizons.com/predictive-mean-matching

## require(mice)

#method: mean=medias, pmm=predictive mean matching

ar_imp <- mice(ar,m=5,maxit=50,

method='pmm',seed=500, printFlag=F)

ar <- complete(ar_imp)

md.pattern(ar)

## Ozone Solar.R Wind Temp Month Day

## [1,] 1 1 1 1 1 1 0

## [2,] 0 0 0 0 0 0 0

Imputacao de dados faltantes: dados multivariadosI Observacoes mais proximas: k-vizinhos mais proximos

## require(VIM) # para usar kNN()

ar <- airquality

ar[4:10,3] <- rep(NA,7)

ar[1:5,4] <- NA

glimpse(kNN(ar,var=c('Ozone','Solar.R','Wind'),k=5))

## Observations: 153

## Variables: 9

## $ Ozone <int> 41, 36, 12, 18, 23, 28, 23, 19...

## $ Solar.R <int> 190, 118, 149, 313, 194, 194, ...

## $ Wind <dbl> 7.4, 8.0, 12.6, 8.6, 8.6, 9.7,...

## $ Temp <int> NA, NA, NA, NA, NA, 66, 65, 59...

## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...

## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,...

## $ Ozone_imp <lgl> FALSE, FALSE, FALSE, FALSE, TR...

## $ Solar.R_imp <lgl> FALSE, FALSE, FALSE, FALSE, TR...

## $ Wind_imp <lgl> FALSE, FALSE, FALSE, TRUE, TRU...

Deteccao de outliers

Definicao de outlier (dados espurios)

“Um outlier é uma observação tão destoante das outras que se torna suspeita de ter sido gerada de uma forma distinta” (Hawkins, 1980)

- Outliers refletem: erros de aferição; falhas de execução; variabilidade intrínseca

Exemplos de outliers

- Caso Hadlum vs. Hadlum (1948): caso de divórcio por causa de uma gravidez de 50 semanas

- Temperaturas: 21h: 21C, 22h: 22C, 23h: 23C, 24h: 38C, 01h: 38C, 02h: 21C

- Registro de ganho de peso: 1.2kg, 1.6kg, 1.9kg, 1.55kg, 2.2kg, 2.25kg

- 10 dados foram lançados e o número de '6' foi registrado: 2, 0, 3, 12, 2, 0, 1, 1, 3

Tratamento de outliers

- “Outliers, para uns, são indesejáveis; para outros, a fonte da informação mais importante”

- The Study of Outliers: Purpose and Model, Barnett (1978): tratamento

1. Identificação (Detecção)
2. Incorporação – mudança do modelo do fenômeno
3. Rejeição
4. Acomodação

Identificacao de outliers

data(rivers)

boxplot(rivers, horizontal=TRUE, col='slategray')

rug(rivers, col='red',ticksize=.1, lwd=.5)

●●● ● ●● ●●● ●●

0 500 1000 1500 2000 2500 3000 3500

Rejeicao de outliers

# Iterative outlier rejection: dropping the values that boxplot.stats()
# flags changes the quartiles, which can expose new outliers, so repeat
# until nothing is flagged any more.
rivers.limpo <- rivers
outliers <- NA # length-1 sentinel so the loop body runs at least once
while (length(outliers) != 0) {
  outliers <- boxplot.stats(rivers.limpo)$out
  manter <- !(rivers.limpo %in% outliers) # TRUE for the values to keep
  rivers.limpo <- rivers.limpo[manter]
}
boxplot(rivers.limpo, horizontal = TRUE, col = 'slategray')
# Draw the rug from the cleaned data, matching the boxplot above; the raw
# `rivers` vector still contains the rejected values.
rug(rivers.limpo, col = 'red', ticksize = .1, lwd = .5)

200 400 600 800 1000

Identificação de outliers: pacote outliers

- Funções:
  - Busca pelo valor com maior diferença em relação à média: outlier()
  - Testa e remove o outlier encontrado: rm.outlier()

require(outliers)

set.seed(1234); y = rnorm(8); y

## [1] -1.2070657 0.2774292 1.0844412 -2.3456977

## [5] 0.4291247 0.5060559 -0.5747400 -0.5466319

outlier(y)

## [1] -2.345698

rm.outlier(y)

## [1] -1.2070657 0.2774292 1.0844412 0.4291247

## [5] 0.5060559 -0.5747400 -0.5466319

Acomodação de outliers: estatísticas robustas

- Outliers são dados com erros ou extremos
- Outliers afetam medidas-resumo

area <- state.area

summary(area)

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 1214 37320 56220 72370 83230 589800

mean(area)

## [1] 72367.98

median(area)

## [1] 56222

mean(area, trim=0.05) # trimmed mean: drops 5% of the observations at each end (10% in total)

## [1] 59957.22

Acomodação de outliers: pacote robust – Estatísticas robustas

## require(robust)

sd(area) # standard deviation

## [1] 88278.01

mad(area) # median absolute deviation

## [1] 35711.39

IQR(area) # interquartile range

## [1] 45916.75

fivenum(area) # five-number summary, as used for a boxplot

## [1] 1214 36291 56222 83557 589757

Outliers: transformacao de variaveis

data(islands)

# problema: tudo e outlier?

boxplot(islands, horizontal=TRUE)

●● ●● ●● ●●

0 5000 10000 15000

Outliers: transformacao de variaveis

islands.z <- scale(islands)

attr(islands.z, 'scaled:center') # media original

## [1] 1252.729

attr(islands.z, 'scaled:scale') # desv. pad. original

## [1] 3371.146

summary(islands.z)

## V1

## Min. :-0.3680

## 1st Qu.:-0.3655

## Median :-0.3594

## Mean : 0.0000

## 3rd Qu.:-0.3172

## Max. : 4.6676

Outliers: transformacao de variaveis

islands.ln <- log(islands)

summary(islands.ln)

## Min. 1st Qu. Median Mean 3rd Qu. Max.

## 2.485 3.020 3.713 4.446 5.211 9.740

boxplot(islands.ln, horizontal=TRUE)

rug(islands.ln, col='red',ticksize=.1, lwd=.5)

●● ●●●

4 6 8 10

Outliers: transformacao de variaveis

# Logical mask, named like `islands`, that is TRUE where the value
# exceeds 1000.
continentes <- islands > 1000
# Subsetting with the mask keeps the selected entries in their original order.
islands[continentes]

## Africa Antarctica Asia

## 11506 5500 16988

## Australia Europe North America

## 2968 3745 9390

## South America

## 6795

Outliers: winsorizing

- Método de substituição de valores extremos por percentis altos (0.2 é o padrão)

require(psych)

data <- c(sample(x=1:10, size=20, replace=TRUE), 13,17)

data

## [1] 3 3 2 3 4 4 2 1 3 9 6 10 9 1 5 3 4

## [18] 6 2 8 13 17

winsor(data,trim=0.1)

## [1] 3.0 3.0 2.0 3.0 4.0 4.0 2.0 2.0 3.0 9.0 6.0 9.9

## [13] 9.0 2.0 5.0 3.0 4.0 6.0 2.0 8.0 9.9 9.9

Selecao de caracterısticas

- Principais abordagens:
  - Eliminar características redundantes
  - Eliminar características com variância zero
  - Buscar pelo subconjunto mínimo de características para obter o melhor resultado de classificação
  - Buscar pelo conjunto máximo que contenha algum tipo de informação relevante

Selecao de caracterısticas: correlacoes

featurePlot(x = iris[, 1:4], y = iris$Species, #caret

plot = "pairs", auto.key = list(columns = 3))

Scatter Plot Matrix

Sepal.Length78

7 8

56

5 6 ●●●●●

●●

●●

●●●

● ●●

●●

●●

●●● ●●●

●●

● ●●

●●●

●●

●●● ●

● ●●

●●

●●

●●

●●

●● ●●

●●●

●●●● ●

●●●●

●●●●●●

●●

●●●

●●

● ●●●

●●

●●

●●

●●●

● ●

●●

●●

●●

●●●

●●●

●●●

●●●

●●●

●●●● ●

●●

●●●●●●

●●

●●●

●●●

●●●●

●●●

●●●●●

●●●●

●●●

●●●

●●

●●●●

●●●●●

●●

●●

●●

●●●●

●●●

●●● ●●

●●●●

●●●●● ●

●●

●●●

●●

●●●●

●●

●●

●●

●●●

●●

●●

●●

●●●

●●●

●●●

●●●

●●●

●●●

●●●●●●●

●●●●●

●●

●●●

●●●

●●●

●●

●●

●●●●●●●●

●●●

●●●

●●

●●●●

●●●

●●

●●

●●

●●

●● ●●

●●●

●●●●●

●●● ●

●●●●

● ●

●●

●●●

●●

●●●●

●●

● ●

●●

●●●

● ●

●●

●●

●●

●●●

● ●●

●●●

●●●

● ●●

●●●●● ●

●●●

●●

●●

●●

●●

●●

●●●

●●●●●

●●●●●

●●

●●●●

●●

●●●

● ●● ●

●●

●●

●●●●

●●

●●

●●●●●●

●●●

●●●

●●

●●

●●

●●● ●

●●

●●●● ●

●●

●●

●●

● ●●

● ●

●●

●●●

●●●

●●● ●●●

●●●

● Sepal.Width3.54.04.5

3.5 4.5

2.02.53.0

2.0 3.0

●●●

●●

●●

●●

●●

●●

●●●

●●●●●

●●●●●●

●●

●●●●

●●

●●●

● ●●●

●●

●●

●●●●

●●

●●●●●●●●

●●●● ●

●●

●●

●●

●●●●

●●

●●●● ●

●●

●●

●●

● ●●

●●

●●

●●●

●●●

●●●●●●

●●●

●●●

●●

●●

●●

●●

●●

●●●

●●●●●

●●●●●

●●

●●●●

●●

●●

● ●●●

●●

●●

●●●●

●●

●●

●●●●

●●●●●● ●

●●

●●

●●

●●●●

●●

●●● ●●

●●

●●

●●

●●●

●●

●●

●●

●●●

●●● ● ●●

●●●

●●●●●●●●● ● ●●●● ●●●● ●●●●●

●●●●●●●● ●●●●● ●●● ●●●● ●●●●● ●●

●●●

●●● ●

●●

●●●●

●●●

●●

●●●●●●●

●●●

●●●●

●● ● ●●●●● ●

●●

●●● ●

●●

●●●●

●● ●

●●●

●● ●●

●●

●●

●● ●

●●● ●● ●●●

●●

●●●

●●●●●●●●●●●

●●●● ● ●●●●● ●●●● ● ●●● ●●● ●●

●●● ●●●●● ● ●●●● ●●● ●●● ● ● ●● ●● ●●

●●●

●●● ●

●●

●●●

●●●●

●●

●●

● ●●●

●●●

●●● ●

●● ●●● ●●

● ●●

●● ●●●

●●

●●●●

●● ●

●● ●● ● ●●

●●

●●

●●●

●●●●● ●●●

●●

●●●●●●●●●

●● ● ●●

Petal.Length4567

4 5 6 7

1234

1 2 3 4●●●●●

●●●●●●●●●●●●●

●●●●●

●●●●●●●●●●●●●●●●●●●●●●

●●●●●

●●●

●●● ●

●●

●●●

●●●●●

●●

●●●

●●● ●●

●●●●

●●●●●●●

●●●

●●●●●

●●

●● ●●

●● ●

●●●

● ●●●

●●

●●

●●●

●●●● ●●●

●●

●●●

●● ●

●●●●●●●●●

●●●●●●●●● ● ●●●● ●

●●● ●●●●

●●

●●●●●●●●

●●●● ●●● ●●●●

●●●●● ●●

●● ●●

●●

●●●

●● ●●

●●

●●●●●

●●●●

●● ● ●●●●●

●●

●●●● ●

●●

●●

● ●

● ●●

●●●●

● ●

●●

●● ●

●●

●●●●

●● ●

●●

●●

●●●

●●

●●●

●●●

●●●● ●●●●●● ●●●● ●

●●● ●●●●

●●●●●●●●●

●●●●● ●●● ●●● ●

●●● ●● ●●

●●●●●●

●●●

●●●●

●●

●●●●●

●●●●

● ● ●●● ●●●

●●

●● ●●●●●

●●

●●

● ●●

●●●●

● ●

●●

●●●

●●

●●●●

●● ●●

●●

● ●

●●●●●

●●

● ●●

●●●●●●●●●●●●●●●●●●●●●●

●●●●

●●●●●●●●●●●●●●●●●

●●●●●●●

●●●●

●●●

●●●

●● ●●

●●●●●●●

●●●●

●●●●●●●●●

●●

●●●●●●

●●

● ●

● ●●

●●●●

●●

●●

●● ●●

●●●●

●●●

●●

●●

●●●●●

●●

●●●

Petal.Width1.52.02.5

1.5 2.5

0.00.51.0

0.0 1.0

setosa versicolor virginica● ● ●

Selecao de caracterısticas: correlacoes

data(iris)

cor(iris[,1:4])

## Sepal.Length Sepal.Width Petal.Length

## Sepal.Length 1.0000000 -0.1175698 0.8717538

## Sepal.Width -0.1175698 1.0000000 -0.4284401

## Petal.Length 0.8717538 -0.4284401 1.0000000

## Petal.Width 0.8179411 -0.3661259 0.9628654

## Petal.Width

## Sepal.Length 0.8179411

## Sepal.Width -0.3661259

## Petal.Length 0.9628654

## Petal.Width 1.0000000

Selecao de caracterısticas: correlacoes

# Pairwise correlations between the four numeric iris measurements.
corr <- cor(iris[, 1:4])
# Count the pairs whose absolute correlation exceeds 0.9. Only the upper
# triangle is inspected, so each pair is counted once and the diagonal is
# left out.
altaCor <- length(which(abs(corr)[upper.tri(corr)] > .9))
altaCor

## [1] 1

Selecao de caracterısticas: correlacoes

# Drop the iris columns that caret::findCorrelation() flags as redundant
# at an absolute-correlation cutoff of 0.95.
library(caret) # library() fails loudly if caret is missing; require() only returns FALSE
corr <- cor(iris[, 1:4])
redundantes <- findCorrelation(corr, cutoff = 0.95)
# Guard the empty case: iris[, -integer(0)] would select ZERO columns
# instead of keeping them all when nothing exceeds the cutoff.
irisSemAltaRedundancia <-
  if (length(redundantes) > 0) iris[, -redundantes] else iris

Selecao de caracterısticas: correlacoes

library(earth)

data(etitanic)

head(model.matrix(survived ~ ., data = etitanic))

## (Intercept) pclass2nd pclass3rd sexmale age

## 1 1 0 0 0 29.0000

## 2 1 0 0 1 0.9167

## 3 1 0 0 0 2.0000

## 4 1 0 0 1 30.0000

## 5 1 0 0 0 25.0000

## 6 1 0 0 1 48.0000

## sibsp parch

## 1 0 0

## 2 1 2

## 3 1 2

## 4 1 2

## 5 1 2

## 6 0 0

dummies <- dummyVars(survived ~ ., data = etitanic)

head(predict(dummies, newdata = etitanic))

## pclass.1st pclass.2nd pclass.3rd sex.female sex.male

## 1 1 0 0 1 0

## 2 1 0 0 0 1

## 3 1 0 0 1 0

## 4 1 0 0 0 1

## 5 1 0 0 1 0

## 6 1 0 0 0 1

## age sibsp parch

## 1 29.0000 0 0

## 2 0.9167 1 2

## 3 2.0000 1 2

## 4 30.0000 1 2

## 5 25.0000 1 2

## 6 48.0000 0 0

#https://www.youtube.com/watch?v=igPQ-pI8Bjo&list=WL&index=33 @ 4:12

Seleção de características: combinações

# Build a 6 x 5 indicator matrix (filled column by column) in which
# column 1 equals column 2 + column 3, i.e. the columns are linearly dependent.
dados <- matrix(
  c(1, 1, 1, 1, 1, 1,
    1, 1, 1, 0, 0, 0,
    0, 0, 0, 1, 1, 1,
    1, 0, 0, 1, 0, 0,
    0, 0, 1, 0, 0, 1),
  nrow = 6, ncol = 5
)

# caret::findLinearCombos() reports which columns can be removed to
# eliminate the linear dependencies.
comb <- findLinearCombos(dados)

# Keep columns by position: if comb$remove is empty/NULL (no dependency
# found), dados[, -comb$remove] would return a matrix with zero columns.
manter <- setdiff(seq_len(ncol(dados)), comb$remove)
dados[, manter, drop = FALSE]

## [,1] [,2] [,3] [,4]

## [1,] 1 1 1 0

## [2,] 1 1 0 0

## [3,] 1 1 0 1

## [4,] 1 0 1 0

## [5,] 1 0 0 0

## [6,] 1 0 0 1

Redução de dimensionalidade: pacote FactoMineR

# Principal Component Analysis (PCA) of the decathlon data set.
library(FactoMineR)  # library() fails loudly if the package is not installed
data("decathlon")

res.pca <- PCA(decathlon,
               quanti.sup = 11:12,  # rank and points: supplementary QUANTItative variables
               quali.sup = 13)      # supplementary categorical variable

Redução de dimensionalidade: PCA

plot(res.pca, choix="ind")

−10 −5 0 5 10

−4

−2

02

4

Individuals factor map (PCA)

Dim 1 (32.72%)

Dim

2 (

17.3

7%)

●●●

●● ●

●●

SEBRLECLAYKARPOV

BERNARD

YURKOV

WARNERSZSIVOCZKY

McMULLENMARTINEAUHERNU

BARRAS

NOOL

BOURGUIGNON

SebrleClay

Karpov

Macey

Warners

Zsivoczky

Hernu

Nool

Bernard

Schwarzl

Pogorelov

SchoenbeckBarras

Smith

AveryanovOjaniemiSmirnovQi

Drews

Parkhomenko

Terek

Gomez

Turi

Lorenzo

Karlivans

Korkizoglou

Uldal

Casarsa

DecastarOlympicG

Redução de dimensionalidade: PCA

# habillage: as cores seguem a 13a variavel

plot(res.pca, choix="ind",habillage=13)

−10 −5 0 5 10

−4

−2

02

4

Individuals factor map (PCA)

Dim 1 (32.72%)

Dim

2 (

17.3

7%)

●●●

●● ●

●●

SEBRLECLAYKARPOV

BERNARD

YURKOV

WARNERSZSIVOCZKY

McMULLENMARTINEAUHERNUBARRAS

NOOL

BOURGUIGNON

SebrleClay

Karpov

Macey

Warners

Zsivoczky

HernuNool

Bernard

Schwarzl

Pogorelov

SchoenbeckBarras

Smith

AveryanovOjaniemiSmirnov

Qi

Drews

Parkhomenko

Terek

Gomez

Turi

Lorenzo

Karlivans

Korkizoglou

Uldal

Casarsa

DecastarOlympicG

DecastarOlympicG

Redução de dimensionalidade: PCA

plot(res.pca, choix="var")

−2 −1 0 1 2

−1.

0−

0.5

0.0

0.5

1.0

Variables factor map (PCA)

Dim 1 (32.72%)

Dim

2 (

17.3

7%)

100m

Long.jump

Shot.put

High.jump

400m

110m.hurdleDiscus

Pole.vault

Javeline1500m

Rank Points

Redução de dimensionalidade: PCA

# Bar chart of the first column of res.pca$eig (titled "Eigenvalues"),
# one bar per row, labelled Dim1, Dim2, ...
barplot(res.pca$eig[, 1], main = "Eigenvalues",
        # paste0() replaces paste(..., sep = ""); seq_len() is safe for zero rows
        names.arg = paste0("Dim", seq_len(nrow(res.pca$eig))))

Dim1 Dim2 Dim3 Dim4 Dim5 Dim6 Dim7 Dim8 Dim9

Eigenvalues

0.0

1.0

2.0

3.0

Redução de dimensionalidade: PCA

# plot dimensions (principal components) 3 and 4 rather than the first two

plot(res.pca, choix = "var", axes = c(3, 4),

lim.cos2.var = 0) # show only variables whose quality (cos2) reaches the threshold; with 0, presumably all are drawn

−2 −1 0 1 2

−1.

0−

0.5

0.0

0.5

1.0

Variables factor map (PCA)

Dim 3 (14.05%)

Dim

4 (

10.5

7%)

100mLong.jumpShot.put

High.jump400m

110m.hurdle

Discus

Pole.vaultJaveline

1500mRank

Points

Redução de dimensionalidade: pacote dimRed

iris.pca <- embed(loadDataSet("Iris"), "PCA")

quality(iris.pca); plot(iris.pca)

## 25

## 0.6526642

PC1

−1.0 −0.5 0.0 0.5 1.0

−3

02

4

● ●● ●●●

●● ●●● ● ●●

●● ● ●●

● ●●●

● ● ●●●● ●●●●● ●●● ● ●●● ●●●●

●● ●● ●

●●

●●

●●

● ●●

● ●●

●●

●●●●

●●●

● ●●●

●●●

●●

● ●●●

●●●●

●●

●●

● ●●

●●●●

● ●

●●

●●●●

●●

●● ●

● ●●

● ●●

●●●

●●●●

−3 −2 −1 0 1 2 3 4

−1.

00.

5

●●●

●●

●●

●●

● ●●● ●

●●●

●●

●●

●●

●●

●●

●●

●●

●●

●●●●

●●●

●● ●●

●●

●●

● ●●

●●

●●

● ●

●●●

●●

●●

●●

●●

●● ●

● ●

●●

●●●

●●

●●●

●●

PC2

Redução de dimensionalidade: pacote dimRed

print(dimRedMethodList())

## [1] "DiffusionMaps" "DRR"

## [3] "FastICA" "KamadaKawai"

## [5] "DrL" "FruchtermanReingold"

## [7] "HLLE" "Isomap"

## [9] "kPCA" "LaplacianEigenmaps"

## [11] "LLE" "MDS"

## [13] "nMDS" "PCA"

## [15] "tSNE"

print(dimRedQualityList())

## [1] "Q_local" "Q_global"

## [3] "mean_R_NX" "AUC_lnK_R_NX"

## [5] "total_correlation" "cophenetic_correlation"

## [7] "distance_correlation" "reconstruction_rmse"

Redução de dimensionalidade: pacote dimRed

iris.ica <- embed(loadDataSet("Iris"), "FastICA")

quality(iris.ica, "distance_correlation"); plot(iris.ica)

## [1] 0.9149342

ICA1

−1.5 −0.5 0.0 0.5 1.0 1.5

−2

02

●●●

●●

●●

●●

●● ●●●

●●●

●●

●●

●●

●●

●●

●●

●●

●●

● ●● ●

● ●●

●●

●●●

●●

●●

●● ●

●●

● ●

●●

●●●

●●

●●

●●

●●

●●●

●●

● ●

●●●

●●

●● ●

●●

−2 −1 0 1 2 3

−1.

50.

01.

5

●●●● ● ●● ●● ● ●●●● ● ●●● ●●●●●

●●● ● ●●●● ●● ●

● ● ●●● ●●●●

● ●● ●● ●●

●●●

●●● ●

●●●

●●●

●●●

●●

●●●

●●

● ●●●

●●● ●

●● ● ●●

●●● ●●

●● ●● ●

●●

● ●●

●● ●

●● ●

● ● ●●

●●

●●

●● ●

●●

● ●● ●

●●

●●

●●●

●●●●

●●●● ●●●

ICA2

Exercícios: mapa

- Fazer gráfico de rotas entre aeroportos usando flights
- Posições geográficas: airports.dat [link]. Segue exemplo:

# Load the airport table (no header row in the file) as a tibble.
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
airports <- as_tibble(read.csv('airports.dat', header = FALSE))

# Assign column names by position.
# NOTE(review): assumes the file has exactly these 14 columns in this order.
colnames(airports) <- c('ID', 'name', 'city', 'country',
                        'IATA_FAA', 'ICAO', 'lat', 'lon',
                        'alt', 'timez', 'T', 'DST', 'arpt', 'O')

# Show position and country of two example airports (LGA and MIA).
select(airports, lat, lon, IATA_FAA, country) %>%
  filter(IATA_FAA %in% c("LGA", "MIA"))

## # A tibble: 2 × 4

## lat lon IATA_FAA country

## <dbl> <dbl> <fctr> <fctr>

## 1 25.7932 -80.2906 MIA United States

## 2 40.7772 -73.8726 LGA United States

#filter(airports, country == 'Brazil')

#%>% select(IATA_FAA,lat,lon)

Mapas: rworldmap

# Coordinates (lat, lon) of every airport whose country is Brazil.
aeroportos <- airports %>%
  filter(country == 'Brazil') %>%
  select(lat, lon)

# Base map from getMap(), restricted to the given longitude/latitude window.
plot(getMap(), xlim = c(-60, -50), ylim = c(-35, 10))

# One small red dot per airport.
with(aeroportos, points(lon, lat, col = 'red', pch = 20, cex = .4))

# Blue segment joining the first two airports of the table.
with(aeroportos[1:2, ], lines(lon, lat, col = 'blue'))

●●

●●

●●

●●

●●

● ●

● ●

●●

● ●

● ●

●●

●●

●●

●●●

●●

● ●

● ●

Trabalho 1

- Dataset: “Cessões de Direitos Minerários”
  - https://app.dnpm.gov.br/DadosAbertos/SCM/Cessoes_de_Direitos.csv
  - Na linha 16934, trocar “D”” por D

- Fazer relatório contendo estatísticas-resumo, testes de hipóteses e gráficos que provem informações para responder às seguintes perguntas:
  - Quais são os estados que mais e menos concedem direitos minerários?
  - Quais as substâncias mais procuradas nos últimos 2, 5 e 10 anos?
  - Existem empresas que recebem um número de concessões muito superior a outras?
  - Quais são as empresas que têm monopólios/duopólios nacionais de extração de substâncias?
  - Existem regiões de concentração geográfica de concessões?

- Lat/lon das cidades brasileiras [link]

- Entregar arquivos “.Rnw” e “.pdf”

Exercícios

- Formule perguntas interessantes que podem ser respondidas com mineração de dados do dataset que lhe for atribuído.

- Realize um estudo exploratório sobre variáveis relevantes para responder às perguntas formuladas.

Recommended