Gaon Chart Crawling
- 가온 차트(Gaon Chart)는 대한민국의 대중음악 공인차트이다. 가온이라는 단어는 가운데, 중심이라는 뜻의 순우리말로, 중심이 되는 차트라는 의미에서 명명되었다. 한국음악콘텐츠협회가 운영하고 문화체육관광부가 후원하는 사업으로, 2년여의 준비기간을 거쳐 2010년 2월 23일 출범했다.
if (!require(pacman)) install.packages('pacman'); library(pacman)
pacman::p_load("rvest", "tidyverse")
# Build one chart URL per period listed on the Gaon online chart page.
#
# Args:
#   termGbn: Period type appended to the query string ('week' or 'month').
#
# Returns:
#   A character vector of complete URLs (base URL + targetTime + hitYear +
#   termGbn), one for each 6-digit period code found on the page.
#
# Raises:
#   An error when the page exposes no 6-digit period codes. Without this
#   check paste0() would treat the empty vector as "" and silently return a
#   single malformed URL.
getLinks <- function(termGbn){ # Input 'week' or 'month'
    base_url <- 'http://gaonchart.co.kr/main/section/chart/online.gaon?nationGbn=T&serviceGbn=ALL'
    sample_url <- paste0(base_url, '&termGbn=', termGbn)

    # Period codes are the 6-digit 'value' attributes of the <option> tags
    # inside the <select> elements of div.fr; anything else is discarded.
    dates <-
        read_html(sample_url) %>%
        html_nodes('div[class=fr]') %>%
        html_nodes('select') %>%
        html_nodes('option') %>%
        html_attr('value') %>%
        str_subset(pattern = '^[0-9]{6}$')

    if (length(dates) == 0) {
        stop("No 6-digit period codes found at ", sample_url, call. = FALSE)
    }

    # Each code is split into hitYear (characters 1-4) and targetTime
    # (characters 5-6). Ex) '202221' -> hitYear '2022', targetTime '21'.
    hitYear <- substr(dates, 1, 4)
    targetTime <- substr(dates, 5, 6)

    # paste0() is vectorised, so this yields one URL per period code.
    paste0(base_url, '&targetTime=', targetTime, '&hitYear=', hitYear, '&termGbn=', termGbn)
}
# Download one Gaon chart page and return it as a tibble.
#
# Args:
#   target_url: A single complete chart URL whose query string contains
#     targetTime, hitYear and, as the last parameter, termGbn.
#
# Returns:
#   A tibble with one row per chart entry and the columns
#   year_week / year_month (named according to termGbn), ranking, change_grp,
#   change_val, title, artist, gaon_index (numeric; NA when the page has no
#   such column), production and distribution.
#
# Raises:
#   An error when no 'PLAY' cell is found at position 11 or 12 of the
#   flattened cell text, because the row width cannot then be determined.
getPage <- function(target_url){ # Input complete URL including targetTime, hitYear, and termGbn

    url_splited <-
        target_url %>%
        strsplit('=|&') %>%
        unlist()

    # The 2-4 digit query values appear as targetTime then hitYear; reversing
    # and joining them rebuilds the period code. Ex) '21', '2022' -> '202221'.
    year_temp <-
        url_splited %>%
        str_subset('^[0-9]{2,4}$') %>%
        rev() %>%
        paste0(collapse = '')

    # termGbn is the value of the last query parameter.
    termGbn <- url_splited[length(url_splited)]

    html_chart <- read_html(target_url) %>%
        html_nodes('div[class=chart]')

    # Flatten every <td> into its non-empty text fragments.
    td <- html_chart %>%
        html_nodes('td') %>%
        html_text() %>%
        str_split('\\n|\\||\\r|\\t') %>%
        unlist() %>%
        str_subset('^$', negate = TRUE)

    # Class attribute of the <span> inside each 'change' cell.
    change_grp <- html_chart %>%
        html_nodes('td[class=change]') %>%
        html_nodes('span') %>%
        html_attr('class')

    # sort by 'PLAY'
    ## Ex)
    ## "~~", "~~", "PLAY", "~~", "~~", "PLAY", "~~", "~~", "PLAY"
    ## to
    ## |~~|~~|'PLAY'|
    ## |~~|~~|'PLAY'|
    ## |~~|~~|'PLAY'|

    # 'Gaon Score' was newly created in Jan 2018.
    ## Before Jan 2018 : chart_piece has 11 columns.
    ## After Jan 2018 : chart_piece has 12 columns.
    # The position of the first row's 'PLAY' cell (11 or 12) is therefore the
    # number of columns per row.
    index_PLAY <- str_which(td, 'PLAY')
    PLAY_ZONE <- c(11, 12)
    index_TRUE_PLAY <- index_PLAY[index_PLAY %in% PLAY_ZONE][1]
    if (is.na(index_TRUE_PLAY)) {
        stop("Could not find a 'PLAY' cell at position 11 or 12 in ",
             target_url, call. = FALSE)
    }

    # Keep the reshaped cells as a plain matrix and index columns by position
    # instead of relying on as_tibble() to invent V1..Vn names for an unnamed
    # matrix.
    chart_piece <- matrix(td, ncol = index_TRUE_PLAY, byrow = TRUE)
    has_gaon_index <- ncol(chart_piece) == 12

    # Note : the condition is a scalar, so plain if/else is used (ifelse would
    # return a single value instead of the whole column).
    gaon_index <- if (has_gaon_index) {
        chart_piece[, 6] %>%
            str_replace_all('[^0-9]', '') %>%
            as.numeric()
    } else {
        NA_real_
    }

    # With the extra Gaon index column, production and distribution each sit
    # one column further to the right.
    shift <- if (has_gaon_index) 1 else 0

    chart <- tibble(year_temp = year_temp,
                    ranking = chart_piece[, 1],
                    change_grp = change_grp,
                    change_val = chart_piece[, 2],
                    title = chart_piece[, 3],
                    artist = chart_piece[, 4],
                    gaon_index = gaon_index,
                    production = chart_piece[, 6 + shift],
                    distribution = chart_piece[, 7 + shift])

    colnames(chart)[1] <- if (termGbn == 'week') 'year_week' else 'year_month'
    chart # Return chart of the 'target_url'.
}
 
# Weekly Rankings.
# Time to run : 7~8min.
# Fetch every weekly chart page, stack the per-page tibbles row-wise and
# write the result to 'gaon_week.txt'. lapply() is used instead of
# 1:length(...), which would iterate over c(1, 0) if no links were returned.
weekLinks <- getLinks('week')
all_week_pages <- lapply(weekLinks, getPage)
week_final <- do.call('rbind', all_week_pages)
write.table(week_final, 'gaon_week.txt', row.names = FALSE)
# Monthly Rankings.
# Time to run : 1~2min.
# Same procedure for the monthly charts, written to 'gaon_month.txt'.
monthLinks <- getLinks('month')
all_month_pages <- lapply(monthLinks, getPage)
month_final <- do.call('rbind', all_month_pages)
write.table(month_final, 'gaon_month.txt', row.names = FALSE)
# Preview the first rows of each result.
head(week_final)
head(month_final)
- 미완성