Wrappers for the CWB tools (cwb-makeall, cwb-huffcode, cwb-compress-rdx).
Unlike the 'original' command line tools, these
wrappers will always perform a specific indexing/compression step on one
positional attribute, and produce all components.
cwb_makeall(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE
)
cwb_huffcode(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
delete = TRUE
)
cwb_compress_rdx(
corpus,
p_attribute,
registry = Sys.getenv("CORPUS_REGISTRY"),
quietly = FALSE,
delete = TRUE
)
cwb_encode(
corpus,
registry = Sys.getenv("CORPUS_REGISTRY"),
data_dir,
vrt_dir,
encoding = "utf8",
p_attributes = c("word", "pos", "lemma"),
s_attributes,
skip_blank_lines = TRUE,
strip_whitespace = TRUE,
xml = TRUE,
quietly = FALSE,
verbose = FALSE
)
name of a CWB corpus (upper case)
name of the p-attribute
path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY
A logical value, whether to turn off messages (including warnings).
A logical value, whether to remove redundant files after compression.
The data directory where cwb_encode will save the binary
files of the indexed corpus. Tilde expansion is performed on data_dir
using path.expand() to avoid a crash.
Directory with input corpus files (verticalised format / file
ending *.vrt). Tilde expansion is performed on vrt_dir using
path.expand() to avoid a crash.
The encoding of the files to be encoded. Needs to be an
encoding supported by CWB, see cwb_charsets(). "UTF-8" is taken as
"utf8". Defaults to "utf8" (recommended charset).
Positional attributes (p-attributes) to be declared.
A list of named character vectors to declare
structural attributes that shall be encoded. The names of the list are the
XML elements present in the corpus. Character vectors making up the list
declare the attributes that include the metadata of regions. To declare a
structural attribute without annotations, provide a zero-length character
vector using character() - see examples.
A logical value, whether to skip blank lines in the input.
A logical value, whether to strip whitespace from tokens.
A logical value, whether input is XML.
A logical value, whether to show progress information
(counter of tokens processed).
# The package includes an 'unfinished' corpus of debates in the UN General
# Assembly ("UNGA"): it does not yet include the reverse index, and it is
# not compressed.
#
# Step one of this example copies the raw corpus to a temporary place and
# writes a matching registry file with the id "unga2".
home_dir <- system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "unga")
tmp_data_dir <- file.path(tempdir(), "indexed_corpora")
tmp_unga_dir <- file.path(tmp_data_dir, "unga2")
if (!file.exists(tmp_data_dir)) dir.create(tmp_data_dir)
if (file.exists(tmp_unga_dir)) {
  # the directory is already there: remove the files it contains
  file.remove(list.files(tmp_unga_dir, full.names = TRUE))
} else {
  dir.create(tmp_unga_dir)
}

# Take the registry file shipped with the package, point its HOME entry at
# the temporary data directory and change the corpus id to "unga2".
regfile <- readLines(
  system.file(package = "RcppCWB", "extdata", "cwb", "registry", "unga")
)
regfile[grep("^HOME", regfile)] <- sprintf('HOME "%s"', tmp_unga_dir)
regfile[grep("^ID", regfile)] <- "ID unga2"
writeLines(text = regfile, con = file.path(get_tmp_registry(), "unga2"))

# file.copy() is vectorised over `from`; invisible() suppresses the printed
# logical vector of copy results.
invisible(
  file.copy(from = list.files(home_dir, full.names = TRUE), to = tmp_unga_dir)
)
# Run cwb_makeall() on p-attribute "word" of the copied corpus; this is the
# R counterpart of the cwb-makeall command line utility.
cwb_makeall(
  corpus = "UNGA2",
  p_attribute = "word",
  registry = get_tmp_registry()
)
#> === Makeall: processing corpus UNGA2 ===
#> Registry directory: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/Rtmpk6lOF7/registry_tmp
#> ATTRIBUTE word
#> + creating LEXSRT ... OK
#> - lexicon OK
#> + creating FREQS ... OK
#> - frequencies OK
#> - token stream OK
#> + creating REVCIDX ... OK
#> + creating REVCORP ... OK
#> ? validating REVCORP ... OK
#> - index OK
#> ========================================
#> [1] 0
# load the corpus from the temporary registry (cl_* interface) ...
cl_load_corpus(
  "UNGA2",
  registry = get_tmp_registry()
)
#> [1] TRUE
# ... and likewise through the cqp_* interface
cqp_load_corpus(
  "UNGA2",
  registry = get_tmp_registry()
)
#> [1] TRUE
# See whether it works: turn corpus positions 0 to 83 into token ids, the
# ids into strings, and paste the strings back together.
ids_sentence_1 <- cl_cpos2id(
  corpus = "UNGA2",
  p_attribute = "word",
  cpos = 0:83,
  registry = get_tmp_registry()
)
tokens_sentence_1 <- cl_id2str(
  corpus = "UNGA2",
  p_attribute = "word",
  id = ids_sentence_1,
  registry = get_tmp_registry()
)
# pasting with " " puts a blank before every token; remove the whitespace
# in front of full stops and commas again
sentence <- paste(tokens_sentence_1, collapse = " ")
sentence <- gsub("\\s+([\\.,])", "\\1", sentence)
# perform cwb_huffcode (equivalent to cwb-huffcode command line utility)
cwb_huffcode(
  corpus = "UNGA2", p_attribute = "word",
  registry = get_tmp_registry()
)
#> COMPRESSING TOKEN STREAM of (null).word
#> - writing code descriptor block to /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.hcd
#> - writing compressed item sequence to /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.huf
#> - writing sync (every 128 tokens) to /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.huf.syn
#> VALIDATING UNGA2.word
#> - reading code descriptor block from /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.hcd
#> - reading compressed item sequence from /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.huf
#> - reading sync (mod 128) from /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.huf.syn
#> !! You can delete the file </var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.corpus> now.
#> redundant file deleted: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/Rtmpk6lOF7/indexed_corpora/unga2/word.corpus
#> [1] 0
# perform cwb_compress_rdx (equivalent to cwb-compress-rdx command line
# utility)
cwb_compress_rdx(
  corpus = "UNGA2", p_attribute = "word",
  registry = get_tmp_registry()
)
#> COMPRESSING INDEX of UNGA2.word
#> - writing compressed index to /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.crc
#> - writing compressed index offsets to /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.crx
#> VALIDATING UNGA2.word
#> - reading compressed index from /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.crc
#> - reading compressed index offsets from /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.crx
#> !! You can delete the file </var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.corpus.rev> now.
#> !! You can delete the file </var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T//Rtmpk6lOF7/indexed_corpora/unga2/word.corpus.rdx> now.
#> redundant file deleted: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/Rtmpk6lOF7/indexed_corpora/unga2/word.corpus.rev
#> redundant file deleted: /var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/Rtmpk6lOF7/indexed_corpora/unga2/word.corpus.rdx
#> [1] 0
# Encode the vrt files shipped with the package as corpus "BTMIN", writing
# the data files to a temporary directory.
data_dir <- file.path(tempdir(), "bt_data_dir")
dir.create(data_dir)

# Declaration of s-attributes: list names are XML elements, the character
# vectors name the attributes of each element. `p` is declared without
# attributes by passing a zero-length character vector.
s_attrs <- list(
  plenary_protocol = c(
    "lp", "protocol_no", "date", "year", "birthday", "version",
    "url", "filetype"
  ),
  speaker = c(
    "id", "type", "lp", "protocol_no", "date", "year", "ai_no", "ai_id",
    "ai_type", "who", "name", "parliamentary_group", "party", "role"
  ),
  p = character()
)

cwb_encode(
  corpus = "BTMIN",
  registry = Sys.getenv("CORPUS_REGISTRY"),
  vrt_dir = system.file(package = "RcppCWB", "extdata", "vrt"),
  data_dir = data_dir,
  p_attributes = c("word", "pos", "lemma"),
  s_attributes = s_attrs
)
#> [1] 0
# Clean up: remove the temporary data directory and the "btmin" file in the
# registry directory. unlink() does not delete directories (not even empty
# ones) unless recursive = TRUE, and it reports no error when it skips one,
# so the flag is required for the data directory to actually go away.
unlink(data_dir, recursive = TRUE)
unlink(file.path(Sys.getenv("CORPUS_REGISTRY"), "btmin"))