ECON 413
Web scraping and text analysis
Erol Taymaz
Department of Economics
Middle East Technical University
RSelenium package
rvest package
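rvest works on static HTML, while RSelenium drives a real browser so that JavaScript-rendered pages can be scraped and then parsed with rvest. A minimal sketch of that workflow (the browser choice, the example URL, and the two-second wait are assumptions; a matching WebDriver must be installed):
library(RSelenium)
library(rvest)
# Start a Selenium-driven browser (assumes Firefox and its driver are available)
drv <- rsDriver(browser = "firefox", verbose = FALSE)
remDr <- drv$client
# Load the page, give the JavaScript time to run, then grab the rendered HTML
remDr$navigate("https://www.example.com")
Sys.sleep(2)
page <- read_html(remDr$getPageSource()[[1]])
# From here on, the usual rvest workflow applies
html_text(html_nodes(page, "h1"))
# Shut down the browser and the Selenium server
remDr$close()
drv$server$stop()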
Used car prices in Turkey
library(rvest)
library(data.table)
library(ggplot2)
url <- c("https://www.sahibinden.com/bmw/sahibinden?pagingSize=50")
fp <- read_html(url)
str(fp)
## List of 2
## $ node:<externalptr>
## $ doc :<externalptr>
## - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
## [1] "Sahibinden BMW Fiyatları & Modelleri"
## [2] "12.122 ilan"
## [1] "12.122 ilan"
tablo <- html_nodes(fp, "#searchResultsTable")
veri <- html_table(tablo[[1]], header = TRUE, trim = TRUE, dec = ",", fill = TRUE)
head(veri)
## Seri Model
## 1 1 Serisi 116d First Edition M Sport
## 2 4 Serisi 440i xDrive
## 3 3 Serisi 320d M Sport
## 4
## 5 <NA> <NA> <NA>
## 6 5 Serisi 520i Premium
## İlan Başlığı Yıl
## 1 2020 MODEL SIFIR BMW 1.16D F1 KOLTUK PAZARLIK YOK 2020
## 2 2020 BMW 440i XDRIVE M SPORT - LASER LIGHT+19''JANT+HARMAN 374BG 2018
## 3 180 BİNDE HATASIZ BOYASIZ DEĞİŞENSİZ BOL EKSTRALI TRNİN EN İYİSİ 2008
## 4
## 5 <NA> <NA>
## 6 Sahibinden Hatasız Garaj arabası 2015
## KM Renk Fiyat İlan Tarihi
## 1 10 Gri 440.900 TL 07 Ocak\n 2021
## 2 950 Gri 2.250.000 TL 07 Ocak\n 2021
## 3 180.000 Gri 235.000 TL 07 Ocak\n 2021
## 4
## 5 <NA> <NA> <NA> <NA>
## 6 106.000 Beyaz 421.500 TL 06 Ocak\n 2021
## İl / İlçe
## 1 İzmirKarabağlar
## 2 İstanbulBeyoğlu
## 3 KocaeliKörfez
## 4
## 5 <NA> <NA>
## 6 SakaryaKocaali
veri <- data.table(veri)
# Clean the price column: drop the " TL" suffix, remove all thousands
# separators (gsub rather than sub, so every "." is removed), and convert to numeric
veri[, Fiyat := sub(" TL", "", Fiyat)]
veri[, Fiyat := gsub("\\.", "", Fiyat)]
veri[, Fiyat := as.numeric(Fiyat)]
veri <- veri[!is.na(Fiyat)]
mean(veri$Fiyat)
## [1] 424368.4
str(veri)
## Classes 'data.table' and 'data.frame': 50 obs. of 11 variables:
## $ V1 : chr "" "" "" "" ...
## $ Seri : chr "1 Serisi" "4 Serisi" "3 Serisi" "5 Serisi" ...
## $ Model : chr "116d First Edition M Sport" "440i xDrive" "320d M Sport" "520i Premium" ...
## $ İlan Başlığı: chr "2020 MODEL SIFIR BMW 1.16D F1 KOLTUK PAZARLIK YOK" "2020 BMW 440i XDRIVE M SPORT - LASER LIGHT+19''JANT+HARMAN 374BG" "180 BİNDE HATASIZ BOYASIZ DEĞİŞENSİZ BOL EKSTRALI TRNİN EN İYİSİ" "Sahibinden Hatasız Garaj arabası" ...
## $ Yıl : chr "2020" "2018" "2008" "2015" ...
## $ KM : chr "10" "950" "180.000" "106.000" ...
## $ Renk : chr "Gri" "Gri" "Gri" "Beyaz" ...
## $ Fiyat : num 440900 2250 235000 421500 307500 ...
## $ İlan Tarihi : chr "07 Ocak\n 2021" "07 Ocak\n 2021" "07 Ocak\n 2021" "06 Ocak\n 2021" ...
## $ İl / İlçe : chr "İzmirKarabağlar" "İstanbulBeyoğlu" "KocaeliKörfez" "SakaryaKocaali" ...
## $ V2 : chr "" "" "" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Convert the year to numeric and compute the age of the car
veri[, Yıl := as.numeric(Yıl)]
veri[, Yas := 2020 - Yıl]
ggplot(veri) + geom_point(aes(x = Yas, y = Fiyat)) +
geom_smooth(aes(y = Fiyat, x = Yas), method = "lm", se = FALSE) +
theme_bw()
ggplot(veri) + geom_point(aes(x = Yas, y = log(Fiyat))) +
geom_smooth(aes(y = log(Fiyat), x = Yas), method = "lm", se = FALSE) +
theme_bw()
summary(lm(log(Fiyat) ~ Yas, data = veri))
##
## Call:
## lm(formula = log(Fiyat) ~ Yas, data = veri)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5791 0.0412 0.2121 0.3901 0.9777
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.10021 0.24915 52.580 <2e-16 ***
## Yas -0.07487 0.03059 -2.447 0.0181 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.162 on 48 degrees of freedom
## Multiple R-squared: 0.1109, Adjusted R-squared: 0.09242
## F-statistic: 5.99 on 1 and 48 DF, p-value: 0.0181
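Because the dependent variable is in logs, the Yas coefficient is (approximately) a percentage effect: each extra year of age is associated with roughly a 7 percent lower price. A quick check of the implied annual change:
# Implied annual price change from the age coefficient
exp(-0.07487) - 1
# roughly -0.072, i.e. about a 7.2% lower price per year of age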
University locations in Turkey
# Read the web page
phtml <- read_html("https://en.wikipedia.org/wiki/List_of_universities_in_Turkey")
# Find the "table" nodes
ptables <- html_nodes(phtml, "table")
# List the tables
ptables
## {xml_nodeset (10)}
## [1] <table class="wikitable sortable"><tbody>\n<tr>\n<th>Province</th>\n<th> ...
## [2] <table class="wikitable sortable"><tbody>\n<tr>\n<th>Province</th>\n<th> ...
## [3] <table class="wikitable sortable"><tbody>\n<tr>\n<th>Province</th>\n<th> ...
## [4] <table class="wikitable sortable"><tbody>\n<tr style="background:#efefef ...
## [5] <table class="nowraplinks mw-collapsible autocollapse navbox-inner" styl ...
## [6] <table class="nowraplinks mw-collapsible autocollapse navbox-inner" styl ...
## [7] <table class="nowraplinks mw-collapsible expanded navbox-inner" style="b ...
## [8] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...
## [9] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...
## [10] <table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbo ...
# Get the 1st and 3rd tables. If new tables are added to the page, the table indices may change.
ptable1 <- html_table(ptables[1], header = TRUE, dec = ".", fill = TRUE)
ptable2 <- html_table(ptables[3], header = TRUE, dec = ".", fill = TRUE)
# Get the data from the table
class(ptable1)
## [1] "list"
## [1] "list"
## Province University Type Founded
## 1 Adana Adana Science and Technological University State 2011
## 2 Adana Çukurova University State 1973
## 3 Adıyaman Adıyaman University State 2006
## 4 Afyonkarahisar Afyon Kocatepe University State 1992
## 5 Afyonkarahisar Afyonkarahisar University of Health Sciences State NA
## 6 Ağrı Ağrı İbrahim Çeçen University State 2007
## Province University Type Founded
## 1 Adana Kanuni University Foundation 2013
## 2 Ankara İpek University Foundation 2011
## 3 Ankara Turgut Özal University Foundation 2008
## 4 Ankara Gülhane Military Medical Academy (Military) Special 1898
## 5 Bursa Bursa Orhangazi University Foundation 2011
## 6 Diyarbakır Selahaddin Eyyubi University Foundation 2013
## Closed
## 1 2016
## 2 2016
## 3 2016
## 4 2016
## 5 2016
## 6 2016
## [1] "Province" "University" "Type" "Founded" "Closed"
## [1] "Province" "University" "Type" "Founded" "Closed"
# Get locations
pdat <- setDT(pdat)
# Find the universities' locations (geocode_OSM() comes from the tmaptools package)
library(tmaptools)
ploc <- setDT(geocode_OSM(pdat$University, as.data.frame = TRUE))
setnames(ploc, "query", "University")
pdat <- ploc[, .(University, lat, lon)][pdat, on = "University"]
# Find the provinces' locations and use them for the missing values
plocc <- setDT(geocode_OSM(unique(pdat[is.na(lat)]$Province), as.data.frame = TRUE))
pdat[is.na(lat), c("lat", "lon") := .(plocc$lat[match(Province, plocc$query)],
plocc$lon[match(Province, plocc$query)])]
# Alternatively, read the prepared dataset from the web
# pdat <- read.csv("http://users.metu.edu.tr/etaymaz/econ413/university_data.csv")
# List universities with missing locations
pdat[is.na(pdat$lat),]
## Empty data.table (0 rows and 7 cols): University,lat,lon,Province,Type,Founded...
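With coordinates attached, the universities can be drawn on a map. A rough sketch, assuming the maps package is installed (it provides the country outlines used by ggplot2::borders()):
# Plot each university at its coordinates on an outline of Turkey
ggplot(pdat) +
  borders("world", regions = "Turkey", fill = "grey90") +
  geom_point(aes(x = lon, y = lat, color = Type), size = 2) +
  coord_quickmap() +
  theme_bw()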
# Libraries
library(rvest)
library(ggplot2)
library(stringi)
library(data.table)
# Functions --------------------------------------------------------------------
# Prepare the list of news pages
sayfaListesiniHazirla <- function(ii) {
cat("Page : ", ii, "\n")
durl <- paste0("https://www.tccb.gov.tr/haberler/?&page=", ii)
doc <- read_html(durl)
doc <- html_node(doc, "#divContentList")
dat <- data.table(tarih = html_text(html_nodes(doc, "dt")),
urls = html_attr(html_nodes(doc, "a"), "href"),
baslik = html_text(html_nodes(doc, "dd")))
return(dat)
}
# Scrape the news text
haberleriDerle <- function(url) {
durl <- paste0("https://www.tccb.gov.tr", url)
doc <- read_html(durl)
haber <- html_text(html_node(doc, "#divContentArea"))
return(haber)
}
# Plot keyword frequency charts
plotKeyWordsChart <- function(x) {
# tumHaberler[, zaman := yil + floor((ay-1)/6)/2]
ff <- vector(mode = "list", length = length(x))
for (i in c(1:length(x))) {
tumHaberler[, sayi := grepl(x[i], haber)]
ff[[i]] <- tumHaberler[, .(kelime = stri_trans_totitle(x[i]),
sayi = sum(sayi), nhaber = .N), by = yil]
}
fff <- do.call("rbind", ff)
gg <- ggplot(fff, aes(x = yil, y = 100*sayi/nhaber,
color = kelime, group = kelime)) +
geom_line(size = 1.5) +
theme_bw(base_size = 24) +
guides(colour = guide_legend(override.aes = list(size=5))) +
labs(x = "", y = "Kelime kullanma sıklığı (%)")
return(gg)
}
# SCRAPE THE DATA ---------------------------------------------------------------
# Find the number of pages
# Get news links
tumListe <- lapply(c(1:25), sayfaListesiniHazirla)
## Page : 1
## Page : 2
## Page : 3
## Page : 4
## Page : 5
## Page : 6
## Page : 7
## Page : 8
## Page : 9
## Page : 10
## Page : 11
## Page : 12
## Page : 13
## Page : 14
## Page : 15
## Page : 16
## Page : 17
## Page : 18
## Page : 19
## Page : 20
## Page : 21
## Page : 22
## Page : 23
## Page : 24
## Page : 25
tumHaberler <- do.call("rbind", tumListe)
# Get the news - IT MAY TAKE SOME TIME TO RETRIEVE THE DATA!!
tumHaberler$haber <- sapply(tumHaberler$urls, haberleriDerle)
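Downloading several hundred articles in a tight loop can strain the server. A hedged variant that pauses briefly between requests (the helper name and the one-second delay are assumptions, not site requirements):
# Wrap haberleriDerle() with a short pause between requests
haberleriYavasDerle <- function(url) {
  Sys.sleep(1)  # wait one second before each request
  haberleriDerle(url)
}
# tumHaberler$haber <- sapply(tumHaberler$urls, haberleriYavasDerle)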
# Clean the data -------------------------------------------------------------
# Use lower case only
# (the "tr_TR" locale is needed so that Turkish letters such as İ/ı are converted correctly)
tumHaberler[, haber := stri_trans_tolower(haber, "tr_TR")]
# Remove \r and \n
tumHaberler[, haber := gsub("\r", " ", haber, fixed = TRUE)]
tumHaberler[, haber := gsub("\n", " ", haber, fixed = TRUE)]
# Add date variable
tumHaberler[, ay := as.integer(substr(tarih, 4, 5))]
tumHaberler[, yil := as.integer(substr(tarih, 7, 10))]
# Save the data
# save(tumHaberler, file="Data/all_news_5December2020_raw.Rdata")
# Word list --------------------------------------------------------------
wlist <- c("trump", "putin", "merkel", "kilicdaroglu")
# Plot it
plotKeyWordsChart(wlist)
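plotKeyWordsChart() measures the share of news items that mention a keyword at least once (via grepl). To count total occurrences instead, stringi's stri_count_fixed() can be used; a minimal sketch with the hypothetical helper countKeyWord():
# Total occurrences of each keyword per year (rather than the share of items)
countKeyWord <- function(x) {
  tumHaberler[, .(kelime = stri_trans_totitle(x),
                  toplam = sum(stri_count_fixed(haber, x))), by = yil]
}
sayilar <- rbindlist(lapply(wlist, countKeyWord))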