Encode CWB Corpus. — encode • cwbtools

encode(x, ...)

# S4 method for data.frame
encode(
  x,
  corpus,
  s_attributes = NULL,
  encoding = "utf8",
  registry_dir = fs::path(tempdir(), "cwb_registry"),
  data_dir = fs::path(tempdir(), "cwb_data_dir", tolower(corpus)),
  properties = c(),
  method = c("R", "CWB"),
  verbose = TRUE,
  compress = FALSE,
  reload = TRUE,
  quietly = TRUE
)

Arguments

x: A data.frame or an object inheriting from data.frame (such as tibble, data.table).
...: Further arguments (unused).
corpus: ID of the CWB corpus to create.
s_attributes: A list of data.frame objects with columns 'cpos_left' and 'cpos_right' and columns with s-attributes, the names of which will serve as names of s-attributes. If s_attributes is a data.frame, it will be coerced to a list.
encoding: Encoding as defined in the charset corpus property of the registry file for the corpus ('latin1' to 'latin9', and 'utf8').
registry_dir: Registry directory.
data_dir: The data directory for the binary files of the corpus.
properties: A named character vector with corpus properties that will be added to the registry file describing the corpus. The names of the vector indicate the property (such as "version"), and the values of the vector are the values of the respective corpus property.
method: Either 'CWB' or 'R', defaults to 'R'. See section 'Details'.
verbose: A logical value, whether to output progress messages.
compress: A logical value, whether to run RcppCWB::cwb_huffcode() and RcppCWB::cwb_compress_rdx() (method 'R'), or command line tools cwb-huffcode and cwb-compress-rdx (method 'CWB'). Defaults to FALSE as compression is not stable on Windows.
reload: A logical value, whether to reload the corpus to make it immediately available.
quietly: A logical value passed into RcppCWB::cwb_makeall(), RcppCWB::cwb_huffcode() and RcppCWB::cwb_compress_rdx() to control verbosity of these functions.

Examples

# This example runs conditionally: dplyr, tidytext and quanteda are only
# suggested packages, so everything is skipped unless all three are installed.

dplyr_available <- requireNamespace("dplyr")
tidytext_available <- requireNamespace("tidytext")
#> Loading required namespace: tidytext
quanteda_available <- requireNamespace("quanteda")
#> Loading required namespace: quanteda

if (dplyr_available && tidytext_available && quanteda_available) {

  library(dplyr) # pipe would not be available otherwise
  library(tidytext)

  # Registry directory for the new corpus. Create it only when missing so that
  # re-running the example in the same session does not raise a warning.
  registry_tmp <- fs::path(tempdir(), "cwb_registry")
  if (!dir.exists(registry_tmp)) {
    dir.create(registry_tmp)
  }

  # Turn the named character vector into a two-column tibble: the element
  # names go to column `party`, the texts to column `text`.
  tidydata <- quanteda::data_char_ukimmig2010 %>%
    as.data.frame() %>%
    as_tibble(rownames = "party") %>%
    rename(`text` = ".")

  # One token per row, keeping case and punctuation. `cpos` is the zero-based
  # corpus position; seq_len() keeps this correct even for an empty table,
  # where `0L:(n - 1L)` would yield c(0L, -1L).
  tokenstream <- tidydata %>%
    unnest_tokens(word, text, to_lower = FALSE, strip_punct = FALSE) %>%
    mutate(cpos = seq_len(nrow(.)) - 1L)

  # Region table for the s-attribute `party`: first and last corpus position
  # of each party's tokens.
  metadata <- tokenstream %>%
    group_by(party) %>%
    summarise(cpos_left = min(cpos), cpos_right = max(cpos))

  # Drop the helper columns so that only the token column `word` is left in
  # the data passed to encode(); pass the registry directory created above
  # explicitly (it is identical to the default of `registry_dir`).
  tokenstream %>%
    select(-cpos, -party) %>%
    encode(
      corpus = "UKIMMIG2010",
      s_attributes = metadata,
      registry_dir = registry_tmp,
      properties = c(lang = "en")
    )

}
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:data.table’:
#> 
#>     between, first, last
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
#> ── Prepare encoding corpus UKIMMIG2010 ─────────────────────────────────────────
#> ℹ registry directory: /tmp/RtmpeQEeXz/cwb_registry
#> ℹ data directory: /tmp/RtmpeQEeXz/cwb_data_dir/ukimmig2010
#> ℹ encoding: "utf8"
#> ℹ method for encoding, indexing and compression: "R"
#> ── encode p-attribute "word" ───────────────────────────────────────────────────
#> ℹ creating indices (in memory)
#> ✔ creating indices (in memory) [201ms]
#> 
#> ℹ writing file: word.corpus
#> ✔ writing file: word.corpus [544ms]
#> 
#> ℹ writing file: word.lexicon
#> ✔ writing file: word.lexicon [18ms]
#> 
#> ℹ writing file: word.lexicon.idx
#> ✔ writing file: word.lexicon.idx [17ms]
#> 
#> ℹ creating new registry file: /tmp/RtmpeQEeXz/cwb_registry/ukimmig2010
#> ℹ run `Rcpp::cwb_makeall()`
#> ✔ run `Rcpp::cwb_makeall()` [7ms]
#> 
#> ✔ corpus reloaded: CL success / CQP success
#> ── Encode s-attributes ─────────────────────────────────────────────────────────
#> ℹ encode s-attribute "party" (9 regions)
#> ℹ add s-attribute "party" to registry
#> ── Prepare and augment registry file ───────────────────────────────────────────
#> ℹ using corpus properties: charset = utf8 // lang = en
#> ℹ writing registry file
#> ✔ writing registry file [5ms]
#> 
#> ── Check result ────────────────────────────────────────────────────────────────
#> ✔ corpus reloaded: CL success / CQP success
#> ✔ all p-attributes are available
#> ✔ all s-attributes are available