encode(x, ...)

# S4 method for data.frame
encode(
  x,
  ...,
  corpus,
  s_attributes = NULL,
  encoding = "utf8",
  registry_dir = fs::path(tempdir(), "cwb_registry"),
  data_dir = fs::path(tempdir(), "cwb_data_dir", tolower(corpus)),
  properties = c(),
  method = c("R", "CWB"),
  verbose = TRUE,
  compress = FALSE,
  reload = TRUE,
  quietly = TRUE
)



A data.frame or an object inheriting from data.frame (such as tibble, data.table).


Further arguments (unused).


ID of the CWB corpus to create.


A list of data.frame objects with columns 'cpos_left' and 'cpos_right' and columns with s-attributes, the names of which will serve as names of s-attributes. If s_attributes is a data.frame, it will be coerced to a list.


Encoding as defined in the charset corpus property of the registry file for the corpus ('latin1' to 'latin9', and 'utf8').


Registry directory.


The data directory for the binary files of the corpus.


A named character vector with corpus properties that will be added to the registry file describing the corpus. Names of the vector indicate a property (such as "version") and the values of the vector the values of a corpus property.


Either 'CWB' or 'R', defaults to 'R'. See section 'Details'.


A logical value, whether to output progress messages.


A logical value, whether to run RcppCWB::cwb_huffcode() and RcppCWB::cwb_compress_rdx() (method 'R'), or command line tools cwb-huffcode and cwb-compress-rdx (method 'CWB'). Defaults to FALSE as compression is not stable on Windows.


A logical value, whether to reload the corpus to make it immediately available.


A logical value passed into RcppCWB::cwb_makeall(), RcppCWB::cwb_huffcode() and RcppCWB::cwb_compress_rdx() to control verbosity of these functions.


# Example: encode a tidy token stream (one token per row) as a CWB corpus.
# It runs conditionally because dplyr, tidytext and quanteda are only
# suggested packages.

dplyr_available <- requireNamespace("dplyr")
tidytext_available <- requireNamespace("tidytext")
#> Loading required namespace: tidytext
quanteda_available <- requireNamespace("quanteda")
#> Loading required namespace: quanteda

if (dplyr_available && tidytext_available && quanteda_available) {

  library(dplyr) # pipe would not be available otherwise

  # Same location as the default of the `registry_dir` argument of encode().
  registry_tmp <- fs::path(tempdir(), "cwb_registry")

  # One row per document; the vector names become the `party` column and
  # the unnamed text column (called "." after the pipe) is renamed to `text`.
  tidydata <- quanteda::data_char_ukimmig2010 %>%
    as.data.frame() %>%
    as_tibble(rownames = "party") %>%
    rename(`text` = ".")

  # One row per token. tidytext is not attached, so unnest_tokens() has to
  # be namespace-qualified. `cpos` is a zero-based running token index.
  tokenstream <- tidydata %>%
    tidytext::unnest_tokens(
      word, text,
      to_lower = FALSE, strip_punct = FALSE
    ) %>%
    mutate(cpos = row_number() - 1L)

  # One region per party, delimited by its first and last token position;
  # these are the 'cpos_left' / 'cpos_right' columns `s_attributes` expects.
  metadata <- tokenstream %>%
    group_by(party) %>%
    summarise(cpos_left = min(cpos), cpos_right = max(cpos))

  # Only the `word` column remains as input; `party` is supplied as an
  # s-attribute via `metadata`.
  tokenstream %>%
    select(-cpos, -party) %>%
    encode(
      corpus = "UKIMMIG2010",
      s_attributes = metadata,
      properties = c(lang = "en")
    )
}
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:data.table’:
#>     between, first, last
#> The following objects are masked from ‘package:stats’:
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#>     intersect, setdiff, setequal, union
#> ── Prepare encoding corpus UKIMMIG2010 ─────────────────────────────────────────
#>  registry directory: /tmp/RtmpeQEeXz/cwb_registry
#>  data directory: /tmp/RtmpeQEeXz/cwb_data_dir/ukimmig2010
#>  encoding: "utf8"
#>  method for encoding, indexing and compression: "R"
#> ── encode p-attribute "word" ───────────────────────────────────────────────────
#>  creating indices (in memory)
#>  creating indices (in memory) [201ms]
#>  writing file: word.corpus
#>  writing file: word.corpus [544ms]
#>  writing file: word.lexicon
#>  writing file: word.lexicon [18ms]
#>  writing file: word.lexicon.idx
#>  writing file: word.lexicon.idx [17ms]
#>  creating new registry file: /tmp/RtmpeQEeXz/cwb_registry/ukimmig2010
#>  run `Rcpp::cwb_makeall()`
#>  run `Rcpp::cwb_makeall()` [7ms]
#>  corpus reloaded: CL success / CQP success
#> ── Encode s-attributes ─────────────────────────────────────────────────────────
#>  encode s-attribute "party" (9 regions)
#>  add s-attribute "party" to registry
#> ── Prepare and augment registry file ───────────────────────────────────────────
#>  using corpus properties: charset = utf8 // lang = en
#>  writing registry file
#>  writing registry file [5ms]
#> ── Check result ────────────────────────────────────────────────────────────────
#>  corpus reloaded: CL success / CQP success
#>  all p-attributes are available
#>  all s-attributes are available