encode(x, ...)
# S4 method for data.frame
encode(
x,
corpus,
s_attributes = NULL,
encoding = "utf8",
registry_dir = fs::path(tempdir(), "cwb_registry"),
data_dir = fs::path(tempdir(), "cwb_data_dir", tolower(corpus)),
properties = c(),
method = c("R", "CWB"),
verbose = TRUE,
compress = FALSE,
reload = TRUE,
quietly = TRUE
)
A data.frame
or an object inheriting from data.frame
(such as
tibble
, data.table
).
Further arguments (unused).
ID of the CWB corpus to create.
A list
of data.frame
objects with columns 'cpos_left'
and 'cpos_right' and columns with s-attributes, the names of which will
serve as names of s-attributes. If s_attributes
is a data.frame
, it will
be coerced to a list
.
Encoding as defined in the charset corpus property of the registry file for the corpus ('latin1' to 'latin9', and 'utf8').
Registry directory.
The data directory for the binary files of the corpus.
A named character
vector with corpus properties that will
be added to the registry file describing the corpus. Names of the vector
indicate a property (such as "version") and the values of the vector the
values of a corpus property.
Either 'CWB' or 'R', defaults to 'R'. See section 'Details'.
A logical
value, whether to output progress messages.
A logical
value, whether to run RcppCWB::cwb_huffcode()
and RcppCWB::cwb_compress_rdx()
(method 'R'), or command line tools
cwb-huffcode
and cwb-compress-rdx
(method 'CWB'). Defaults to FALSE
as compression is not stable on Windows.
A logical value, whether to reload the corpus to make it immediately available.
A logical
value passed into RcppCWB::cwb_makeall()
,
RcppCWB::cwb_huffcode()
and RcppCWB::cwb_compress_rdx()
to control
verbosity of these functions.
# This is an example we run conditionally as packages are suggested.
# The example body only runs when all three suggested packages are installed.
dplyr_available <- requireNamespace("dplyr")
tidytext_available <- requireNamespace("tidytext")
#> Loading required namespace: tidytext
quanteda_available <- requireNamespace("quanteda")
#> Loading required namespace: quanteda
if (dplyr_available && tidytext_available && quanteda_available) {
  library(dplyr) # pipe would not be available otherwise
  library(tidytext)

  # Same path as the default of the `registry_dir` argument of encode().
  # Only create it when missing: dir.create() warns on an existing directory,
  # which would happen when the example is run twice in one session.
  registry_tmp <- fs::path(tempdir(), "cwb_registry")
  if (!dir.exists(registry_tmp)) {
    dir.create(registry_tmp)
  }

  # One row per document: column 'party' (from the names of the character
  # vector) and column 'text'.
  tidydata <- quanteda::data_char_ukimmig2010 %>%
    as.data.frame() %>%
    as_tibble(rownames = "party") %>%
    rename(`text` = ".")

  # One row per token. Corpus positions are zero-based; seq_len() keeps the
  # sequence empty (instead of c(0, -1)) should there be no tokens at all.
  tokenstream <- tidydata %>%
    unnest_tokens(word, text, to_lower = FALSE, strip_punct = FALSE) %>%
    mutate(cpos = seq_len(nrow(.)) - 1L)

  # One region per party, with the 'cpos_left' and 'cpos_right' columns
  # that encode() expects for s-attributes.
  metadata <- tokenstream %>%
    group_by(party) %>%
    summarise(cpos_left = min(cpos), cpos_right = max(cpos))

  # Drop the helper columns so that only the token column 'word' is left,
  # then encode it together with the 'party' regions.
  tokenstream %>%
    select(-cpos, -party) %>%
    encode(
      corpus = "UKIMMIG2010",
      s_attributes = metadata,
      registry_dir = registry_tmp, # explicit; identical to the default
      properties = c(lang = "en")
    )
}
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:data.table’:
#>
#> between, first, last
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
#> ── Prepare encoding corpus UKIMMIG2010 ─────────────────────────────────────────
#> ℹ registry directory: /tmp/RtmpeQEeXz/cwb_registry
#> ℹ data directory: /tmp/RtmpeQEeXz/cwb_data_dir/ukimmig2010
#> ℹ encoding: "utf8"
#> ℹ method for encoding, indexing and compression: "R"
#> ── encode p-attribute "word" ───────────────────────────────────────────────────
#> ℹ creating indices (in memory)
#> ✔ creating indices (in memory) [201ms]
#>
#> ℹ writing file: word.corpus
#> ✔ writing file: word.corpus [544ms]
#>
#> ℹ writing file: word.lexicon
#> ✔ writing file: word.lexicon [18ms]
#>
#> ℹ writing file: word.lexicon.idx
#> ✔ writing file: word.lexicon.idx [17ms]
#>
#> ℹ creating new registry file: /tmp/RtmpeQEeXz/cwb_registry/ukimmig2010
#> ℹ run `Rcpp::cwb_makeall()`
#> ✔ run `Rcpp::cwb_makeall()` [7ms]
#>
#> ✔ corpus reloaded: CL success / CQP success
#> ── Encode s-attributes ─────────────────────────────────────────────────────────
#> ℹ encode s-attribute "party" (9 regions)
#> ℹ add s-attribute "party" to registry
#> ── Prepare and augment registry file ───────────────────────────────────────────
#> ℹ using corpus properties: charset = utf8 // lang = en
#> ℹ writing registry file
#> ✔ writing registry file [5ms]
#>
#> ── Check result ────────────────────────────────────────────────────────────────
#> ✔ corpus reloaded: CL success / CQP success
#> ✔ all p-attributes are available
#> ✔ all s-attributes are available