Introduction
Because I am interested in the development of Xiongan New Area near Beijing, I subscribe to Google Alerts on that subject. For this website I keep track of these alerts in two entries:
- Google Alerts ‘xiongan new area (latest only)’ (the latest download). This entry is no longer included in the website because it is generated with `draft: true`.
- Google Alerts ‘xiongan new area’ (all alerts up to now).

The first is created by executing the R code in the next section. The second also contains entries that, for one reason or another, are no longer in the latest download. It is maintained manually: I add the new entries of the latest download to the previous version and change the date (when I remember to do that).
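As an aside, here is a minimal sketch of how the new entries could be identified automatically, assuming both pages exist as md files; the file names are hypothetical, and the comparison assumes each alert occupies a single markdown line, as the code in the next section produces:

```r
# hypothetical file names for the two entries described above
latest <- readLines("xiong-an-news-latest.md", encoding = "UTF-8")
all    <- readLines("xiong-an-news.md", encoding = "UTF-8")
# link lines ('- [title](url) ') in the latest download
# that are not yet in the cumulative page
new_entries <- setdiff(grep("^- \\[", latest, value = TRUE),
                       grep("^- \\[", all, value = TRUE))
```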
Code for creating the page for the latest download:

```r
library(rvest)

rvest_alerts <-
  function(url,
           outfile = "outfile", # path to output md file
           title = "title",     # text to add to 'Google Alerts' in title
           slug = "slug"        # name to use in Hugo website
  ) {
    # Google Alerts produces a page with alerts to webpages.
    # This function creates a md file with links to these webpages.
    # retrieve (copy of) webpage with Google Alerts
    html1 <- url %>%
      read_html()
    # retrieve nodes with title and link
    msg1 <- html1 %>%
      html_nodes("h4 a")
    # retrieve node with latest date and convert to a date-time value
    page_date <- html1 %>%
      html_nodes(".date") %>%
      html_text() %>%
      lubridate::parse_date_time("%B %d, %Y")
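    # e.g. the text "January 27, 2019" parses to the POSIXct value 2019-01-27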
    # retrieve nodes with 'xx days ago' and extract xx
    a1 <- html1 %>%
      html_nodes(".age") %>%
      html_text() %>%
      stringr::str_extract('(\\d+)') %>%
      as.numeric()
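    # e.g. the text "2 days ago" yields the number 2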
    # retrieve text of links
    t1 <- msg1 %>% html_text()
    # retrieve href of links and extract original url
    h1 <- msg1 %>%
      html_attr('href') %>%
      stringr::str_extract("(?<=url=)(.*)") %>%
      stringr::str_extract("([^&]*)(?=&)")
    # create lines with webpage references in markdown format
    th1 <- purrr::map2_chr(t1, h1, ~ glue::glue('- [{.x}]({.y}) '))
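    # e.g. '- [Some title](https://example.com/story) '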
    # to intertwine the links with the dates we locate the start
    # positions of the a1 and h1 elements created above
    htmlc <- as.character(html1)
    sp <- purrr::map(
      stringr::str_locate_all(
        htmlc,
        c('<span class="age"', '<a class="result_title_link')
      ),
      ~ .[, 1]
    ) %>%
      setNames(c('A', 'L'))
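    # sp is now a named list: sp$A holds the character offsets of the
    # 'age' spans, sp$L the offsets of the result links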
    # convert the list to a tibble and sort on location
    sptab <- purrr::imap_dfr(sp,
                             ~ tibble::tibble(
                               x = .x,             # location
                               t = .y,             # type (A or L)
                               n = seq(length(.x)) # sequence number
                             )) %>%
      dplyr::arrange(x)
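    # sptab now contains one row per age span or link, e.g.
    #   x = 1041, t = 'A', n = 1   (first age span)
    #   x = 1290, t = 'L', n = 1   (first link)
    # with the rows in the order in which they appear on the page
    # (the offsets shown are made up for illustration)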
    # remove objects no longer needed
    rm(list = c('h1', 'html1', 'htmlc', 'msg1', 'sp', 't1'))
    # simulate and insert a '0 days ago' line
    # when there is a reference for `page_date`
    if (sptab$t[1] == 'L') {
      a1 <- c(0, a1)
      sptab <- rbind(tibble::tibble(x = 0, t = 'A', n = 0), sptab)
      sptab <- sptab %>% dplyr::mutate(n = ifelse(t == 'A', n + 1, n))
    }
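    # (the newest alerts, dated page_date itself, carry no '.age' span;
    # when the first located element is a link we therefore prepend a
    # synthetic age entry of 0 days and shift the age sequence numbers)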
    # convert 'xx days ago' to actual date and format date line
    # (prepend and append an empty line)
    a1 <-
      format(page_date - lubridate::days(a1), format = '%d %B %Y') %>%
      purrr::map(~ c(' ', glue::glue('### {.x} '), ' '))
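    # e.g. a date two days before "27 January 2019" becomes the
    # header line '### 25 January 2019', surrounded by blank lines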
    # walk through sptab selecting dates and links in the right order
    sptab <- sptab %>%
      dplyr::mutate(r = ifelse(t == 'A', a1[n], th1[n])) %>%
      dplyr::select(r) %>%
      unlist()
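    # sptab is now a character vector with the markdown lines
    # (date headers and link lines) in on-page order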
    # format date in yyyy-mm-dd format
    page_date <- format(page_date, format = '%Y-%m-%d')
    # create yaml header for md file
    header <- c(
      "---",
      glue::glue("title: Google Alerts '{title}'"),
      glue::glue("date: '{page_date}'"),
      glue::glue("slug: '{slug}'"),
      "draft: true",
      "categories: []",
      "tags: []",
      "---",
      " "
    )
    # write header and data to md outfile
    con <- file(outfile, "w", encoding = "UTF-8")
    cat(header, sptab, sep = '\n', file = con)
    close(con)
  }
# url <- "D:/data/magweg/Google Alerts 27jan2019.html"
# rvest_alerts(url, # (copy of) Google Alerts webpage
# outfile = "./content/xiong-an/xiong-an-news-latest.md",
# title = 'xiongan new area (latest only)',
# slug = 'xiong-an-news-latest')
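
To illustrate the two-step url extraction used in the function, here is a small standalone example with a made-up redirect href; the exact shape of Google's redirect urls is an assumption here:

```r
# hypothetical Google Alerts redirect href (shape assumed)
href <- "https://www.google.com/url?rct=j&url=https://example.com/xiongan-story&ct=ga"
href %>%
  stringr::str_extract("(?<=url=)(.*)") %>% # "https://example.com/xiongan-story&ct=ga"
  stringr::str_extract("([^&]*)(?=&)")      # "https://example.com/xiongan-story"
```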