if (!require(rvest)) install.packages('rvest')
library(rvest)
if (!require(tidyverse)) install.packages('tidyverse')
library(tidyverse)
# Download one dynasty's emperor list from Wikipedia and return the text of
# the table rows that look like emperor entries.
#
# nation: 'Ming' selects the Ming list; any other value selects the Qing list.
# Returns a character vector with one element per matching <tr>, in page order.
.fetch_emperor_rows <- function(nation) {
  ming_url <- 'https://en.wikipedia.org/wiki/List_of_emperors_of_the_Ming_dynasty'
  qing_url <- 'https://en.wikipedia.org/wiki/List_of_emperors_of_the_Qing_dynasty'
  target_url <- if (nation == 'Ming') ming_url else qing_url

  page <- read_html(target_url)
  # Only the first table whose class attribute is exactly "wikitable" is used.
  first_table <- html_nodes(page, 'table[class=wikitable]')[1]
  row_nodes <- html_nodes(html_nodes(first_table, 'tbody'), 'tr')
  row_text <- html_text(row_nodes)

  # Use a regular expression to keep only rows in the expected format:
  # 5+ letters directly followed by "(<1-2 digits><whitespace><capital letter>",
  # i.e. a name immediately followed by a parenthesised date.
  is_emperor_row <- str_detect(row_text, '[A-Za-z]{5,}\\([0-9]{1,2}\\s[A-Z]{1}')
  row_text[is_emperor_row]
}

# Extract dynasty, name and two years for the `order`-th emperor row.
#
# order:  1-based position of the emperor among the matching table rows.
# nation: 'Ming' or 'Qing'. Any value other than 'Ming' is treated as 'Qing'.
# rows:   optional character vector of emperor row texts (the format produced
#         by .fetch_emperor_rows()). When NULL (the default) the page is
#         downloaded; pass it explicitly to avoid re-downloading the same page
#         for every emperor.
#
# Returns a character vector of length 4: c(dynasty, name, year1, year2), where
# year1/year2 are the first two runs of four or more digits in the row (NA when
# the row has fewer than two such runs).
# Stops with an error when `order` does not refer to an existing row.
get_kings_info <- function(order, nation, rows = NULL) {
  stopifnot(
    is.character(nation), length(nation) == 1, !is.na(nation),
    is.numeric(order), length(order) == 1, !is.na(order)
  )
  dynasty <- if (nation == 'Ming') 'Ming' else 'Qing'

  if (is.null(rows)) {
    rows <- .fetch_emperor_rows(nation)
  }
  if (order < 1 || order > length(rows)) {
    stop(
      'order ', order, ' is out of range: ', length(rows),
      ' emperor rows found for ', dynasty,
      call. = FALSE
    )
  }
  row <- rows[order]

  # The name is everything before the first opening parenthesis, once the
  # line breaks have been removed from the row text.
  name <- sub('\\(.*$', '', gsub('\n', '', row))

  # Day and month numbers never exceed two digits, so any run of four or more
  # digits is taken to be a year; the first two such runs are returned.
  long_numbers <- regmatches(row, gregexpr('[0-9]{4,}', row))[[1]]
  year <- long_numbers[1:2]

  c(dynasty, name, year)
}
# Number of emperor rows to collect from each dynasty's list.
ming_last_order <- 16
qing_last_order <- 12

# One get_kings_info() result per emperor, in table order, for each dynasty.
ming_kings_info <- lapply(
  seq_len(ming_last_order),
  function(k) get_kings_info(k, 'Ming')
)
qing_kings_info <- lapply(
  seq_len(qing_last_order),
  function(k) get_kings_info(k, 'Qing')
)

# Stack every per-emperor vector into one character matrix (Ming rows first),
# label the columns, and write the result to a text file.
kings_info <- do.call(rbind, append(ming_kings_info, qing_kings_info))
colnames(kings_info) <- c('dynasty', 'name', 'birth', 'death')
write.table(kings_info, file = 'China_king.txt', row.names = FALSE)