-
Notifications
You must be signed in to change notification settings - Fork 0
/
README.Rmd
151 lines (110 loc) · 4.99 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
---
output: github_document
---
<!-- README.md is generated from README.Rmd. Please edit that file -->
```{r, include = FALSE}
knitr::knit_hooks$set(pngquant = knitr::hook_pngquant)
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/README-",
out.width = "100%",
echo = FALSE,
message = FALSE,
dev = "ragg_png",
pngquant = "--speed=1 --quality=50"
)
```
# oneclust <img src="man/figures/logo.png" align="right" width="120" />
<!-- badges: start -->
[![R-CMD-check](https://github.com/nanxstats/oneclust/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/nanxstats/oneclust/actions/workflows/R-CMD-check.yaml)
[![CRAN Version](https://www.r-pkg.org/badges/version/oneclust)](https://cran.r-project.org/package=oneclust)
[![Downloads from the RStudio CRAN mirror](https://cranlogs.r-pkg.org/badges/oneclust)](https://cran.r-project.org/package=oneclust)
<!-- badges: end -->
Implements the maximum homogeneity clustering algorithm for one-dimensional data described in W. D. Fisher (1958) <[doi:10.1080/01621459.1958.10501479](https://www.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479)> via dynamic programming.
Check `vignette("oneclust")` for its applications in feature engineering, regression modeling, and peak calling.
## Installation
You can install oneclust from CRAN:
```r
install.packages("oneclust")
```
Or try the development version from GitHub:
```r
remotes::install_github("nanxstats/oneclust")
```
## Gallery
```{r}
library("oneclust")
```
### Feature engineering for high-cardinality categorical features
```{r, high-cardinality, fig.width=10, fig.height=5, dpi=227}
df_levels <- sim_postcode_levels(nlevels = 500, seed = 42)
train <- sim_postcode_samples(df_levels, n = 100000, threshold = 3000, prob = c(0.2, 0.1), seed = 43)
test <- sim_postcode_samples(df_levels, n = 100000, threshold = 3000, prob = c(0.2, 0.1), seed = 44)
k <- 32
level_hist <- table(train$postcode)
level_new <- oneclust(level_hist, k)$cluster
feature_tr_levels <- as.character(1:k)
feature_tr <- as.character(level_new[match(train$postcode, names(level_hist))])
feature_tr <- ordered(feature_tr, levels = feature_tr_levels)
feature_te <- as.character(level_new[match(test$postcode, names(level_hist))])
feature_te <- ordered(feature_te, levels = feature_tr_levels)
par(mfrow = c(1, 2))
par(las = 1)
plot(feature_tr, train$label, lty = 0, xlab = "Cluster", ylab = "Label", main = "Recoded Training Set")
abline(h = 0.2, col = cud(1))
abline(h = 0.1, col = cud(2))
par(las = 1)
plot(feature_te, test$label, lty = 0, xlab = "Cluster", ylab = "Label", main = "Recoded Test Set")
abline(h = 0.2, col = cud(1))
abline(h = 0.1, col = cud(2))
```
### Grouping coefficients in regression models
```{r, coefficients, fig.width=8, fig.height=8, dpi=227}
par(mfrow = c(2, 2))
set.seed(42)
n <- 100
i <- 1:n
y <- (i > 20 & i < 30) + 5 * (i > 50 & i < 70) + rnorm(n, sd = 0.1)
out <- genlasso::fusedlasso1d(y)
beta1 <- coef(out, lambda = 1.5)$beta
plot(beta1, main = "Raw Estimates")
abline(h = 0)
beta2 <- genlasso::softthresh(out, lambda = 1.5, gamma = 1)
grp <- as.integer(beta2 != 0) + 1L
plot(beta2, col = cud(grp), main = "Soft-Thresholding")
abline(h = 0)
legend("topleft", legend = c("Zero", "Non-zero"), col = cud(unique(grp)), pch = 1)
cl1 <- oneclust(beta1, k = 2)$cluster
plot(beta1, col = cud(cl1), main = "Maximum Homogeneity Clustering (k = 2)")
abline(h = 0)
legend("topleft", legend = paste("Cluster", unique(cl1)), col = cud(unique(cl1)), pch = 1)
cl2 <- oneclust(beta1, k = 3)$cluster
plot(beta1, col = cud(cl2), main = "Maximum Homogeneity Clustering (k = 3)")
abline(h = 0)
legend("topleft", legend = paste("Cluster", unique(cl2)), col = cud(unique(cl2)), pch = 1)
```
### Sequential data peak calling and segmentation
```{r, peak-calling, fig.width=7.5, fig.height=10, dpi=227}
x <- seq(0, 1, len = 1024)
pos <- c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81)
hgt <- c(4, 5, 3, 4, 5, 4.2, 2.1, 4.3, 3.1, 5.1, 4.2)
wdt <- c(0.005, 0.005, 0.006, 0.01, 0.01, 0.03, 0.01, 0.01, 0.005, 0.008, 0.005)
psignal <- numeric(length(x))
for (i in seq(along = pos)) {
psignal <- psignal + hgt[i] / (1 + abs((x - pos[i]) / wdt[i]))^4
}
par(mfrow = c(4, 1))
plot(psignal, type = "l", main = "Raw Signal")
cl <- oneclust(psignal, k = 2)
plot(psignal, type = "h", col = cud(cl$cluster), main = "Peak Calling (k = 2)")
legend("topright", legend = paste("Cluster", unique(cl$cluster)), col = cud(unique(cl$cluster)), lty = 1)
cl <- oneclust(psignal, k = 4)
plot(psignal, type = "h", col = cud(cl$cluster + 2), main = "Peak Calling (k = 4)")
legend("topright", legend = paste("Cluster", unique(cl$cluster)), col = cud(unique(cl$cluster + 2)), lty = 1)
cl <- oneclust(psignal, k = 6, sort = FALSE)
plot(psignal, type = "h", col = cud(cl$cluster), main = "Segmentation - Preserving Data Order (k = 6)")
legend("topright", legend = paste("Cluster", unique(cl$cluster)), col = cud(unique(cl$cluster)), lty = 1, cex = 0.8)
```
## License
oneclust is free and open source software, licensed under GPL-3.