Intro

- 가온 차트(Gaon Chart)는 대한민국의 대중음악 공인차트이다. 가온이라는 단어는 가운데, 중심이라는 뜻의 순우리말로, 중심이 되는 차트라는 의미에서 명명되었다. 한국음악콘텐츠협회가 운영하고 문화체육관광부가 후원하는 사업으로, 2년여의 준비 기간을 거쳐 2010년 2월 23일 출범했다.

Crawling

if (!require(pacman)) install.packages('pacman'); library(pacman)
pacman::p_load("rvest", "tidyverse")

getLinks <- function(termGbn){ # Input 'week' or 'month'
    # Build the URL of every chart page available for the given term type.
    #
    # termGbn : term type appended to the query string ('week' or 'month').
    # Returns : a character vector of complete URLs, one per period code
    #           found on the landing page for that term type.
    base_url <- 'http://gaonchart.co.kr/main/section/chart/online.gaon?nationGbn=T&serviceGbn=ALL'
    landing_url <- paste0(base_url, '&termGbn=', termGbn)

    # Collect the 'value' attribute of every <option> inside a <select>
    # that sits under div.fr on the landing page.
    landing_page <- read_html(landing_url)
    selector_box <- html_nodes(landing_page, 'div[class=fr]')
    option_nodes <- html_nodes(html_nodes(selector_box, 'select'), 'option')
    option_values <- html_attr(option_nodes, 'value')

    # Keep only values made of exactly six digits. Ex) "202221".
    period_codes <- str_subset(option_values, pattern = '^[0-9]{6}$')

    # Each code is split into two query parameters:
    #   characters 1-4 -> hitYear    Ex) "202221" -> "2022"
    #   characters 5-6 -> targetTime Ex) "202221" -> "21"
    year_part <- substr(period_codes, 1, 4)
    period_part <- substr(period_codes, 5, 6)

    # paste0 is vectorised, so this yields one URL per period code.
    paste0(base_url, '&targetTime=', period_part, '&hitYear=', year_part, '&termGbn=', termGbn)
}

getPage <- function(target_url){ # Input complete URL including targetTime, hitYear, and termGbn
    # Scrape one chart page and return its rows as a tibble.
    #
    # target_url : a complete chart URL whose query string carries targetTime,
    #              hitYear and, as the last parameter, termGbn.
    # Returns    : a tibble with columns year_week or year_month (chosen by
    #              termGbn), ranking, change_grp, change_val, title, artist,
    #              gaon_index (numeric), production, distribution.
    # Stops with an error when no 'PLAY' cell is found at position 11 or 12,
    # i.e. when the page does not have the expected row layout.

    # Split the URL on '=' and '&' so parameter names and values become
    # separate elements.
    url_splited <- 
        target_url %>%
        strsplit('=|&') %>%
        unlist()
    
    # The only pure-digit pieces of 2-4 characters are targetTime and hitYear
    # (in that order); reversing and concatenating gives e.g. "2022" + "21".
    year_temp <- 
        url_splited %>%
        str_subset('^[0-9]{2,4}$') %>%
        rev() %>%
        paste0(collapse = '')
    
    # termGbn is the last query parameter, so its value is the last piece.
    termGbn <- url_splited[length(url_splited)]

    html_chart <- read_html(target_url) %>%
        html_nodes('div[class=chart]')

    # Flatten the text of every <td> into one character vector of non-empty
    # cells, splitting on newlines, '|', carriage returns and tabs.
    td <- html_chart %>% 
        html_nodes('td') %>%
        html_text() %>%
        str_split('\\n|\\||\\r|\\t') %>%
        unlist() %>%
        str_subset('^$', negate = TRUE)

    # Class of the <span> inside each 'change' cell (e.g. up / down / new).
    change_grp <- html_chart %>%
        html_nodes('td[class=change]') %>%
        html_nodes('span') %>%
        html_attr('class')
    
    # sort by 'PLAY'
    ## Ex) 
    ## "~~", "~~", "PLAY", "~~", "~~", "PLAY", "~~", "~~", "PLAY"
    ## to
    ## |~~|~~|'PLAY'|
    ## |~~|~~|'PLAY'|
    ## |~~|~~|'PLAY'|
    
    # 'Gaon Score' was newly created in Jan 2018. 
    ## Before Jan 2018 : chart_piece has 11 columns.
    ## After Jan 2018 : chart_piece has 12 columns. 
    index_PLAY <- str_which(td, 'PLAY')
    PLAY_ZONE <- c(11, 12)
    index_TRUE_PLAY <- index_PLAY[index_PLAY %in% PLAY_ZONE][1]

    # Without this guard an unexpected layout would reach matrix(ncol = NA)
    # and fail with an unhelpful message.
    if (is.na(index_TRUE_PLAY)) {
        stop("Could not find a 'PLAY' cell at position 11 or 12 in: ", target_url, call. = FALSE)
    }

    # Name the columns V1..Vn explicitly: as_tibble() on an unnamed matrix is
    # deprecated since tibble 2.0.0 and emits a lifecycle warning.
    chart_matrix <- matrix(td, ncol = index_TRUE_PLAY, byrow = TRUE)
    colnames(chart_matrix) <- paste0('V', seq_len(ncol(chart_matrix)))
    chart_piece <- as_tibble(chart_matrix)

    # In the 12-column layout the extra score column sits at V6 and pushes
    # production / distribution one position to the right.
    has_score <- ncol(chart_piece) == 12

    # Note : ifelse always returns an object of the same length as the condition. so we use if/else for this case.
    chart <- tibble(year_temp = year_temp,
                        ranking = chart_piece$V1,
                        change_grp = change_grp,
                        change_val = chart_piece$V2,
                        title = chart_piece$V3,
                        artist = chart_piece$V4,
                        gaon_index = if (has_score) chart_piece$V6 else NA,
                        production = if (has_score) chart_piece$V7 else chart_piece$V6,
                        distribution = if (has_score) chart_piece$V8 else chart_piece$V7)

    # Strip everything but digits (e.g. thousands separators) before
    # converting the score to a number; NA stays NA.
    chart$gaon_index <- chart$gaon_index %>% 
                            str_replace_all('[^0-9]', '') %>%
                            as.numeric()
                    
    # Scalar condition, so plain if/else rather than ifelse().
    colnames(chart)[1] <- if (termGbn == 'week') 'year_week' else 'year_month'
    return(chart) # Return chart of the 'target_url'.
}
 

# Weekly Rankings.
# Time to run : 7~8min.
# Fetch every weekly chart page in link order, stack the per-page tibbles
# into one table and write it to 'gaon_week.txt'.
weekLinks <- getLinks('week')
# lapply replaces the preallocate-and-fill loop; unlike 1:length(x) it does
# not iterate over c(1, 0) when there are no links.
all_week_pages <- lapply(weekLinks, getPage)
week_final <- do.call('rbind', all_week_pages)
write.table(week_final, 'gaon_week.txt', row.names = FALSE)

# Monthly Rankings.
# Time to run : 1~2min.
# Same procedure for the monthly charts, written to 'gaon_month.txt'.
monthLinks <- getLinks('month')
all_month_pages <- lapply(monthLinks, getPage)
month_final <- do.call('rbind', all_month_pages)
write.table(month_final, 'gaon_month.txt', row.names = FALSE)
필요한 패키지를 로딩중입니다: pacman

Warning message:
"패키지 'pacman'는 R 버전 4.1.3에서 작성되었습니다"
Warning message:
"The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated."
head(week_final)
A tibble: 6 × 9
year_week ranking change_grp change_val title artist gaon_index production distribution
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
202221 1 NA - That That (prod. & feat. SUGA of BTS) 싸이 (Psy) 31776501 피네이션 Dreamus
202221 2 up 2 LOVE DIVE IVE (아이브) 27807024 스타쉽엔터테인먼트 Kakao Entertainment
202221 3 up 2 TOMBOY (여자)아이들 27241781 큐브엔터테인먼트 Kakao Entertainment
202221 4 down 2 봄여름가을겨울 (Still Life) BIGBANG (빅뱅) 25590419 YG Entertainment YG PLUS
202221 5 up 1 사랑인가 봐 멜로망스(Melomance) 22846960 플렉스엠 Kakao Entertainment
202221 6 down 3 다시 만날 수 있을까 임영웅 19074593 물고기뮤직 Dreamus
head(month_final)
A tibble: 6 × 9
year_month ranking change_grp change_val title artist gaon_index production distribution
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
202204 1 new new 봄여름가을겨울 (Still Life) BIGBANG (빅뱅) 168901052 YG Entertainment YG PLUS
202204 2 up 1 TOMBOY (여자)아이들 133304108 큐브엔터테인먼트 Kakao Entertainment
202204 3 up 23 Feel My Rhythm 레드벨벳(Red Velvet) 111957588 SM Entertainment Dreamus
202204 4 new new LOVE DIVE IVE (아이브) 105286003 스타쉽엔터테인먼트 Kakao Entertainment
202204 5 up 3 사랑인가 봐 멜로망스(Melomance) 100850288 플렉스엠 Kakao Entertainment
202204 6 down 4 GANADARA (Feat. 아이유) 박재범 96954973 MORE VISION Kakao Entertainment

Analysis

  • 미완성