[Experimental]

encode(x, ...)

# S4 method for data.frame
encode(
  x,
  corpus,
  s_attributes = NULL,
  encoding = "utf8",
  registry_dir = fs::path(tempdir(), "cwb_registry"),
  data_dir = fs::path(tempdir(), "cwb_data_dir", tolower(corpus)),
  properties = c(),
  method = c("R", "CWB"),
  verbose = TRUE,
  compress = FALSE,
  reload = TRUE,
  quietly = TRUE
)

Arguments

x

A data.frame or an object inheriting from data.frame (such as tibble, data.table).

...

Further arguments (unused).

corpus

ID of the CWB corpus to create.

s_attributes

A list of data.frame objects with columns 'cpos_left' and 'cpos_right' and columns with s-attributes, the names of which will serve as names of s-attributes. If s_attributes is a data.frame, it will be coerced to a list.

encoding

Encoding as defined in the charset corpus property of the registry file for the corpus ('latin1' to 'latin9', and 'utf8').

registry_dir

Registry directory.

data_dir

The data directory for the binary files of the corpus.

properties

A named character vector with corpus properties that will be added to the registry file describing the corpus. Names of the vector indicate a property (such as "version") and the values of the vector the values of a corpus property.

method

Either 'CWB' or 'R', defaults to 'R'. See section 'Details'.

verbose

A logical value, whether to output progress messages.

compress

A logical value, whether to run RcppCWB::cwb_huffcode() and RcppCWB::cwb_compress_rdx() (method 'R'), or command line tools cwb-huffcode and cwb-compress-rdx (method 'CWB'). Defaults to FALSE as compression is not stable on Windows.

reload

A logical value, whether to reload the corpus to make it immediately available.

quietly

A logical value passed into RcppCWB::cwb_makeall(), RcppCWB::cwb_huffcode() and RcppCWB::cwb_compress_rdx() to control verbosity of these functions.

Examples

# This example is run conditionally because dplyr, tidytext and quanteda are
# suggested (optional) packages that may not be installed.

dplyr_available <- requireNamespace("dplyr")
tidytext_available <- requireNamespace("tidytext")
#> Loading required namespace: tidytext
quanteda_available <- requireNamespace("quanteda")
#> Loading required namespace: quanteda

if (dplyr_available && tidytext_available && quanteda_available){

library(dplyr) # pipe would not be available otherwise
library(tidytext)

registry_tmp <- fs::path(tempdir(), "cwb_registry")
dir.create(registry_tmp)

tidydata <- quanteda::data_char_ukimmig2010 %>%
   as.data.frame() %>%
   as_tibble(rownames = "party") %>%
   rename(`text` = ".")
   
tokenstream <- tidydata %>%
   unnest_tokens(word, text, to_lower = FALSE, strip_punct = FALSE) %>%
   mutate(cpos = 0L:(nrow(.) - 1L))
   
metadata <- tokenstream %>% 
  group_by(party) %>% 
  summarise(cpos_left = min(cpos), cpos_right = max(cpos))

tokenstream %>%
  select(-cpos, -party) %>%
  encode(
    corpus = "UKIMMIG2010",
    s_attributes = metadata,
    properties = c(lang = "en")
  )
  
}
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:data.table’:
#> 
#>     between, first, last
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
#> ── Prepare encoding corpus UKIMMIG2010 ─────────────────────────────────────────
#>  registry directory: /tmp/RtmpeQEeXz/cwb_registry
#>  data directory: /tmp/RtmpeQEeXz/cwb_data_dir/ukimmig2010
#>  encoding: "utf8"
#>  method for encoding, indexing and compression: "R"
#> ── encode p-attribute "word" ───────────────────────────────────────────────────
#>  creating indices (in memory)
#>  creating indices (in memory) [201ms]
#> 
#>  writing file: word.corpus
#>  writing file: word.corpus [544ms]
#> 
#>  writing file: word.lexicon
#>  writing file: word.lexicon [18ms]
#> 
#>  writing file: word.lexicon.idx
#>  writing file: word.lexicon.idx [17ms]
#> 
#>  creating new registry file: /tmp/RtmpeQEeXz/cwb_registry/ukimmig2010
#>  run `Rcpp::cwb_makeall()`
#>  run `Rcpp::cwb_makeall()` [7ms]
#> 
#>  corpus reloaded: CL success / CQP success
#> ── Encode s-attributes ─────────────────────────────────────────────────────────
#>  encode s-attribute "party" (9 regions)
#>  add s-attribute "party" to registry
#> ── Prepare and augment registry file ───────────────────────────────────────────
#>  using corpus properties: charset = utf8 // lang = en
#>  writing registry file
#>  writing registry file [5ms]
#> 
#> ── Check result ────────────────────────────────────────────────────────────────
#>  corpus reloaded: CL success / CQP success
#>  all p-attributes are available
#>  all s-attributes are available