## CRAN_packages.R
# This code by Gergely Daróczi
# from https://gist.github.com/daroczig/3cf06d6db4be2bbe3368
#' Build a URL below the CRAN source package directory
#'
#' @param ... Path components appended, in order, to the `src/contrib`
#'   base URL (forwarded to `file.path()`).
#' @return A character vector of URLs; the bare base URL when called
#'   without arguments.
CRAN_page <- function(...) {
  contrib_root <- 'https://cran.rstudio.com/src/contrib'
  file.path(contrib_root, ...)
}
## get list of currently available packages on CRAN:
## download the index page and parse its first HTML table
library(XML)
pkgs <- readHTMLTable(readLines(CRAN_page()),
                      which = 1, stringsAsFactors = FALSE)
## we love data.table (converted in place so `:=` works below)
library(data.table)
setDT(pkgs)
## drop directories (rows whose Size is '-')
pkgs <- pkgs[Size != '-']
## drop files that do not seem to be R packages; the dots are escaped so
## that only a literal '.tar.gz' suffix is accepted
pkgs <- pkgs[grep('\\.tar\\.gz$', Name)]
## package name should contain only (ASCII) letters, numbers and dot:
## keep the leading run of such characters of the file name
pkgs[, name := sub('^([a-zA-Z0-9\\.]*).*', '\\1', Name)]
## grab date from last modified timestamp and store it as character
## NOTE(review): assumes timestamps look like '16-Aug-2015 23:05' and that
## '%b' month abbreviations parse in the current locale; rows that do not
## parse become NA -- TODO confirm against the live listing
pkgs[, date := as.character(
  as.POSIXct(`Last modified`, format = '%d-%b-%Y %H:%M'))]
## keep date and name
pkgs <- pkgs[, .(name, date)]
## packages with at least one archived version: parse the first HTML table
## of the Archive index page
archives <- readHTMLTable(
  readLines(CRAN_page('Archive')),
  stringsAsFactors = FALSE,
  which = 1
)
setDT(archives)
## keep directories only, i.e. entries whose name ends with a slash
archives <- archives[grepl('/$', Name)]
## strip the trailing slash to get the bare package name
archives[, Name := sub('/$', '', Name)]
## append packages not found in the current list of R packages;
## `fill = TRUE` leaves their `date` column as NA
pkgs <- rbind(
  pkgs,
  archives[!Name %in% pkgs$name, .(name = Name)],
  fill = TRUE
)
## sort packages alphabetically by name
setorder(pkgs, name)
## a package with a known date (currently published) counts as one
## released version, every other package starts at zero
pkgs[, versions := as.numeric(!is.na(date))]
## mark archived packages: those that also appear in the archive listing
pkgs[, archived := name %in% archives$Name]
## blank the date of packages with archived versions, so that their first
## release date is looked up from the archive instead
pkgs[(archived), date := NA]
## lookup release date of first version & number of releases
## (runs once per package that still has no date, one HTTP request each)
pkgs[is.na(date), c('date', 'versions') := {
  ## progress indicator
  cat(name, '\n')
  ## download archive page; a failed request is reported as a warning and
  ## treated as an empty page instead of aborting the whole scrape
  page <- tryCatch(
    readLines(CRAN_page('Archive', name)),
    error = function(e) {
      warning('Could not read archive page of ', name, ': ',
              conditionMessage(e), call. = FALSE)
      character(0)
    })
  ## listing lines that refer to a source tarball, one per archived version
  ## (counted directly rather than assuming fixed header/footer line counts)
  releases <- grep('\\.tar\\.gz', page, value = TRUE)
  ## extract dates with regexp as HTML parsing can be slow :)
  ## NOTE(review): assumes timestamps look like '16-Aug-2015 23:05' and that
  ## '%b' parses in the current locale -- TODO confirm against the live page
  stamp_regex <- '[0-9]{2}-[A-Za-z]{3}-[0-9]{4} [0-9]{2}:[0-9]{2}'
  stamps <- regmatches(releases, regexpr(stamp_regex, releases))
  released <- as.POSIXct(stamps, format = '%d-%b-%Y %H:%M')
  released <- released[!is.na(released)]
  ## the first listed file is not necessarily the oldest one (e.g. '_0.10'
  ## sorts before '_0.9' by name), so take the earliest timestamp of all
  first_seen <- if (length(released) > 0) {
    as.character(min(released))
  } else {
    NA_character_
  }
  ## return: first release date & current version (if any) plus archived ones
  list(first_seen, versions + length(releases))
}, by = name]
## rename cols
setnames(pkgs, 'date', 'first_release')
## order by date & alphabet; packages whose release date is unknown (NA) are
## placed last so that they do not take the lowest index values and shift
## the cumulative count of every dated package
setorder(pkgs, first_release, name, na.last = TRUE)
## running count of packages in order of first release
pkgs[, index := .I]
## peek at the rows at positions 250, 500, 1000, 2000, ..., 9000
pkgs[c(250, 500, (1:9)*1000)]
## plot trend: cumulative number of packages against first release date
library(ggplot2)
## y axis ticks every 1000 packages, up to the next multiple of 1000 above
## the number of packages found (instead of a fixed upper limit)
## NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
## `size` is kept here for compatibility with older versions
ggplot(pkgs, aes(as.Date(first_release), index)) +
  geom_line(size = 1) +
  scale_x_date(date_breaks = '2 year', date_labels = '%Y') +
  scale_y_continuous(
    breaks = seq(0, ceiling(nrow(pkgs) / 1000) * 1000, 1000)) +
  xlab('') + ylab('') + theme_bw() +
  ggtitle('Number of R packages ever published on CRAN')
## save the plot created above (ggsave() defaults to the last plot)
ggsave("CRAN_packages.png")
## export the full table for later use
write.csv(pkgs, 'pkgs.csv', row.names = FALSE)