Introduction
Because I am interested in the development of Xiongan New Area nearby Beijing, I regularly check the South China Morning Post topic on that subject. I use the following code to retrieve the relevant articles in the webpage SCMP Xiongan topics :
# do_rvest.scmp.R
source("./R/rvest_scmp.R")
do_it <- function (version = 'v2', suf = 'test') {
tmp_table = read_scmp_topics(
"https://www.scmp.com/topics/xiongan-new-area",
lastpage = -1,
version = version
) %>%
dplyr::arrange(desc(d))
if (stringr::str_length(suf) == 0) {
suf1 = ''
suf2 = ''
draft1 = 'false'
} else {
suf1 = glue::glue(' {suf}')
suf2 = glue::glue('-{suf}')
draft1 = 'true'
}
topictable_to_web_page(
intable = tmp_table,
outfile = glue::glue("./content/xiong-an/xiong-an-scmp-topics{suf2}.md"),
title = glue::glue('SCMP Xiongan topics{suf1}'),
slug = glue::glue('scmp-xiongan-topics{suf2}'),
draft = draft1,
categories = c('China', 'Xiongan'),
tags = c('China', 'Xiongan')
)
}
do_it(version = 'v2', suf = 'test')
The two functions in ‘rvest_scmp.R’ read_scmp_topics
(to collect the data) and topictable_to_web_page
(to format the webpage) are not dependent on the specific topic. The contents of ‘rvest_scmp.R’ is listed here:
# rvest_scmp.R
library(rvest)
read_scmp_topics <- function (url,
lastpage = 1,
version = 'v2') {
# read title, href and date from scmp topic page into table
# version dependent functions
nodes1 <- function(topics) {
# retrieve main nodes for given version
if (version == 'v1') {
html_nodes(topics, '.view-mode-lvl_3 , .view-mode-lvl_11')
} else {
html_nodes(topics, '.article-level-three , .article-level-ten')
}
}
nodes2 <- function(topics) {
# retrieve sub nodes for given version
if (version == 'v1') {
html_nodes(topics, '.node-title a')
} else {
html_nodes(topics, '.article__link')
}
}
nodes3 <- function(topic) {
# retrieve date for given version
if (version == 'v1') {
html_nodes(topic, '.node-title ~ time') %>% # subnode with date
html_attr('datetime') %>% # datetime part
stringr::str_sub(end = 10) # date as YYYY-MM-DD
} else {
html_nodes(topic, '.status-left__time') %>% # subnode with date
html_text() %>% # datetime part
stringr::str_sub(end = 11) %>% # date as DD MMM YYYY
as.Date(., format = '%d %B %Y') # date as YYY-MM-DD
}
}
fullurl <- function(url) {
if (stringr::str_sub(url, start = 1, end = 4) == 'http') {
url
} else {
paste0('https://www.scmp.com', url)
}
}
# collect xml_nodeset with references to topics
topics <- read_html(url) %>%
nodes1()
# and same for following pages
page = 0
while (page <= lastpage) {
page = page + 1
topics2 <- glue::glue('{url}?page={page}') %>%
read_html() %>%
nodes1()
topics = c(topics, topics2)
}
# collect from each topic the title, href and date
purrr::map_dfr(topics,
function(ns) {
nt <- ns %>%
nodes2() # subnode with title and href
t1 <- nt %>%
html_text() %>% # title part
stringr::str_remove_all(., '\n') %>% # remove newline characters
stringr::str_trim(side = 'both') # remove white space characters
h1 <- nt %>%
html_attr('href') %>% # href part
fullurl() # to full url
d1 <- ns %>%
nodes3()
if (length(d1) == 0)
d1 = ' ' # no date then set empty string to blank
tibble::tibble(t = t1, h = h1, d = d1) # to table format
})
}
topictable_to_web_page <-
function (intable,
# table with title, link and date for topics
outfile = "outfile", # path to output md file
title = "title", # title of webpage
slug = "slug", # name to use in Hugo website
draft = 'false', # publicize (draft == 'false')
categories = c(), # categories to add
tags = c() # tags to add
)
{
intable2 = intable %>%
dplyr::mutate(d = format(as.Date(d), format = '%d %B %Y')) %>%
dplyr::mutate(d = ifelse(is.na(d), ' ', d))
th1 = purrr::pmap_chr(intable2, ~ glue::glue('{..3}<br>[{..1}]({..2})<br> '))
# format current date in yyyy-mm-dd format
page_date = format(Sys.Date(), format = '%Y-%m-%d')
# create yaml header for md file
header = c(
"---",
glue::glue("title: {title}"),
glue::glue("date: '{page_date}'"),
glue::glue("slug: '{slug}'"),
glue::glue("draft: '{draft}'"),
glue::glue(
"categories: [{tcats}]",
tcats = glue::glue_collapse(categories, sep = ',')
),
glue::glue("tags: [{ttags}]", ttags = glue::glue_collapse(tags, sep =
',')),
"---",
" "
)
# write header and data to md outfile
cat(header, th1,
sep = '\n', file = (con <-
file(outfile, "w", encoding = "UTF-8")))
close(con)
}