Handling SCMP Topics

Han Oostdijk

2019/01/28

Introduction

Because I am interested in the development of Xiongan New Area nearby Beijing, I regularly check the South China Morning Post topic on that subject. I use the following code to retrieve the relevant articles in the webpage SCMP Xiongan topics :

# do_rvest.scmp.R

source("./R/rvest_scmp.R")

do_it <- function (version = 'v2', suf = 'test') {
	tmp_table = read_scmp_topics(
		"https://www.scmp.com/topics/xiongan-new-area",
		lastpage = -1,
		version = version
	)  %>%
		dplyr::arrange(desc(d))

	if (stringr::str_length(suf) == 0) {
		suf1 = ''
		suf2 = ''
		draft1 = 'false'
	} else {
		suf1 = glue::glue(' {suf}')
		suf2 = glue::glue('-{suf}')
		draft1 = 'true'
	}

	topictable_to_web_page(
		intable = tmp_table,
		outfile = glue::glue("./content/xiong-an/xiong-an-scmp-topics{suf2}.md"),
		title   = glue::glue('SCMP Xiongan topics{suf1}'),
		slug    = glue::glue('scmp-xiongan-topics{suf2}'),
		draft   = draft1,
		categories = c('China', 'Xiongan'),
		tags    = c('China', 'Xiongan')
	)
}

do_it(version = 'v2', suf = 'test')

The two functions in ‘rvest_scmp.R’ read_scmp_topics (to collect the data) and topictable_to_web_page (to format the webpage) are not dependent on the specific topic. The contents of ‘rvest_scmp.R’ is listed here:

# rvest_scmp.R

library(rvest)

read_scmp_topics <- function (url,
	lastpage = 1,
	version = 'v2') {
	# read title, href and date from scmp topic page into table

	# version dependent functions
	nodes1 <- function(topics) {
		# retrieve main nodes for given version
		if (version == 'v1') {
			html_nodes(topics, '.view-mode-lvl_3 , .view-mode-lvl_11')
		} else {
			html_nodes(topics, '.article-level-three , .article-level-ten')
		}
	}
	nodes2 <- function(topics) {
		# retrieve sub nodes for given version
		if (version == 'v1') {
			html_nodes(topics, '.node-title a')
		} else {
			html_nodes(topics, '.article__link')
		}
	}
	nodes3 <- function(topic) {
		# retrieve date for given version
		if (version == 'v1') {
			html_nodes(topic, '.node-title ~ time') %>% # subnode with date
				html_attr('datetime') %>% # datetime part
				stringr::str_sub(end = 10) # date as YYYY-MM-DD
		} else {
			html_nodes(topic, '.status-left__time') %>% # subnode with date
				html_text() %>% # datetime part
				stringr::str_sub(end = 11) %>% # date as DD MMM YYYY
				as.Date(., format = '%d %B %Y') # date as YYY-MM-DD
		}
	}

	fullurl <- function(url) {
		if (stringr::str_sub(url, start = 1, end = 4) == 'http') {
			url
		} else {
			paste0('https://www.scmp.com', url)
		}
	}

	# collect xml_nodeset with references to topics
	topics <-  read_html(url) %>%
		nodes1()
	# and same for following pages
	page = 0
	while (page <= lastpage) {
		page = page + 1
		topics2 <-  glue::glue('{url}?page={page}') %>%
			read_html()  %>%
			nodes1()
		topics = c(topics, topics2)
	}

	# collect from each topic the title, href and date
	purrr::map_dfr(topics,
		function(ns) {
			nt <- ns %>%
				nodes2() # subnode with title and href
			t1 <- nt %>%
				html_text() %>% # title part
				stringr::str_remove_all(., '\n') %>% # remove newline characters
				stringr::str_trim(side = 'both') # remove white space characters
			h1 <- nt %>%
				html_attr('href') %>% # href part
				fullurl() # to full url
			d1 <- ns %>%
				nodes3()
			if (length(d1) == 0)
				d1 = ' ' # no date then set empty string to blank
			tibble::tibble(t = t1, h = h1, d = d1) # to table format
		})
}

topictable_to_web_page <-
	function (intable,
		# table with title, link and date for topics
		outfile = "outfile", 	# path to output md file
		title = "title", 		  # title of webpage
		slug = "slug",		    # name to use in Hugo website
		draft = 'false',      # publicize (draft == 'false')
		categories = c(), 		# categories to add
		tags = c()            # tags to add
		)
		{
			intable2 = intable %>%
				dplyr::mutate(d = format(as.Date(d), format = '%d %B %Y')) %>%
				dplyr::mutate(d = ifelse(is.na(d), ' ', d))
			th1 = purrr::pmap_chr(intable2,  ~ glue::glue('{..3}<br>[{..1}]({..2})<br> '))

			# format current date in yyyy-mm-dd format
			page_date = format(Sys.Date(), format = '%Y-%m-%d')
			# create yaml header for md file
			header = c(
				"---",
				glue::glue("title: {title}"),
				glue::glue("date: '{page_date}'"),
				glue::glue("slug: '{slug}'"),
				glue::glue("draft: '{draft}'"),
				glue::glue(
					"categories: [{tcats}]",
					tcats = glue::glue_collapse(categories, sep = ',')
				),
				glue::glue("tags: [{ttags}]", ttags = glue::glue_collapse(tags, sep =
						',')),
				"---",
				" "
			)
			# write header and data to md outfile
			cat(header, th1,
				sep = '\n', file = (con <-
						file(outfile, "w", encoding = "UTF-8")))
			close(con)
		}