Obtain all cooccurrences in a corpus, or a partition. The result is a
`Cooccurrences`-class object which includes a `data.table` with
counts of cooccurrences. See the documentation entry for the
`Cooccurrences`-class for methods to process `Cooccurrences`-class
objects.
# S4 method for corpus
Cooccurrences(
  .Object,
  p_attribute,
  left,
  right,
  stoplist = NULL,
  mc = getOption("polmineR.mc"),
  verbose = FALSE,
  progress = FALSE
)

# S4 method for character
Cooccurrences(
  .Object,
  p_attribute,
  left,
  right,
  stoplist = NULL,
  mc = getOption("polmineR.mc"),
  verbose = FALSE,
  progress = FALSE
)

# S4 method for slice
Cooccurrences(
  .Object,
  p_attribute,
  left,
  right,
  stoplist = NULL,
  mc = getOption("polmineR.mc"),
  verbose = FALSE,
  progress = FALSE
)

# S4 method for partition
Cooccurrences(
  .Object,
  p_attribute,
  left,
  right,
  stoplist = NULL,
  mc = getOption("polmineR.mc"),
  verbose = FALSE,
  progress = FALSE
)

# S4 method for subcorpus
Cooccurrences(
  .Object,
  p_attribute,
  left,
  right,
  stoplist = NULL,
  mc = getOption("polmineR.mc"),
  verbose = FALSE,
  progress = FALSE
)
.Object | A length-one character vector indicating a corpus, or a `corpus`, `partition` or `subcorpus` object. |
---|---|
p_attribute | Positional attributes to evaluate. |
left | A scalar `integer` value, size of the left context. |
right | A scalar `integer` value, size of the right context. |
stoplist | Tokens to exclude from the analysis. |
mc | Logical value, whether to use multiple cores. |
verbose | Logical value, whether to output messages. |
progress | Logical value, whether to display a progress bar. |
The implementation uses a `data.table` to store information and makes
heavy use of the reference logic of the `data.table` package, to avoid
copying potentially large objects, and to be parsimonious with limited
memory. The behaviour resulting from in-place changes may be unfamiliar,
see the examples.
To learn about methods available for the object that is returned,
see the documentation of the `Cooccurrences`-class. See the
`cooccurrences`-method (starting with a lower case c) to get
the cooccurrences for the matches for a query, which may also be a CQP query.
if (FALSE) {
  # Example script for Cooccurrences(). The whole script is wrapped in
  # `if (FALSE)`, so nothing below is evaluated when the file is sourced.
  # NOTE(review): the examples presumably require the REUTERS and
  # GERMAPARLMINI sample corpora to be available - confirm before running.

  # In a first scenario, we get all cooccurrences for the REUTERS corpus,
  # excluding stopwords
  stopwords <- unname(unlist(
    noise(
      terms("REUTERS", p_attribute = "word"),
      stopwordsLanguage = "en"
    )
  ))
  r <- Cooccurrences(
    .Object = "REUTERS", p_attribute = "word",
    left = 5L, right = 5L, stoplist = stopwords
  )
  ll(r) # note that the table in the stat slot is augmented in-place
  decode(r) # in-place modification, again
  r <- subset(r, ll > 11.83 & ab_count >= 5)
  data.table::setorderv(r@stat, cols = "ll", order = -1L)
  head(r, 25)

  # Plotting is optional: only attempted if igraph is installed
  if (requireNamespace("igraph", quietly = TRUE)){
    r@partition <- enrich(r@partition, p_attribute = "word")
    g <- as_igraph(r, as.undirected = TRUE)
    plot(g)
  }

  # The next scenario is a cross-check that extracting cooccurrences
  # from a Cooccurrences-class object with all cooccurrences and the result
  # for getting cooccurrences for a single object are identical
  a <- cooccurrences(r, query = "oil")
  a <- data.table::as.data.table(a)

  b <- cooccurrences(
    "REUTERS", query = "oil", left = 5, right = 5, p_attribute = "word"
  )
  b <- data.table::as.data.table(b)
  # Apply the same stoplist to b that was used when computing r
  b <- b[!word %in% stopwords]

  all(b[["word"]][1:5] == a[["word"]][1:5]) # needs to be identical!

  stopwords <- unlist(noise(
    terms("GERMAPARLMINI", p_attribute = "word"),
    stopwordsLanguage = "german"
    )
  )

  # We now filter cooccurrences by keeping only the statistically
  # significant cooccurrences, identified by comparison with cooccurrences
  # derived from a reference corpus

  plpr_partition <- partition(
    "GERMAPARLMINI", date = "2009-11-10", interjection = "speech",
    p_attribute = "word"
  )
  plpr_cooc <- Cooccurrences(
    plpr_partition, p_attribute = "word",
    left = 3L, right = 3L,
    stoplist = stopwords,
    verbose = TRUE
  )
  decode(plpr_cooc)
  ll(plpr_cooc)

  merkel <- partition(
    "GERMAPARLMINI", speaker = "Merkel", date = "2009-11-10",
    interjection = "speech", regex = TRUE,
    p_attribute = "word"
  )
  merkel_cooc <- Cooccurrences(
    merkel, p_attribute = "word",
    left = 3L, right = 3L,
    stoplist = stopwords,
    verbose = TRUE
  )
  decode(merkel_cooc)
  ll(merkel_cooc)

  # Keep the cooccurrences of the Merkel partition that rank among the
  # top 50 (by rank_ll) in the comparison with the reference cooccurrences
  merkel_min <- subset(
    merkel_cooc,
    by = subset(features(merkel_cooc, plpr_cooc), rank_ll <= 50)
  )

  # Essentially the same procedure as in the previous example, but with
  # two positional attributes, so that part-of-speech annotation is
  # used for additional filtering.

  protocol <- partition(
    "GERMAPARLMINI", date = "2009-11-10", p_attribute = c("word", "pos"),
    interjection = "speech"
  )
  protocol_cooc <- Cooccurrences(
    protocol, p_attribute = c("word", "pos"),
    left = 3L, right = 3L
  )
  ll(protocol_cooc)
  decode(protocol_cooc)

  merkel <- partition(
    "GERMAPARLMINI", speaker = "Merkel", date = "2009-11-10",
    interjection = "speech", regex = TRUE,
    p_attribute = c("word", "pos")
  )
  merkel_cooc <- Cooccurrences(
    merkel, p_attribute = c("word", "pos"),
    left = 3L, right = 3L,
    verbose = TRUE
  )
  ll(merkel_cooc)
  decode(merkel_cooc)

  f <- features(merkel_cooc, protocol_cooc)
  # Restrict both sides of each cooccurrence to the "NN" and "ADJA" tags
  f <- subset(f, a_pos %in% c("NN", "ADJA"))
  f <- subset(f, b_pos %in% c("NN", "ADJA"))
  # Keep the first 50 rows of f (logical mask: 50 x TRUE, remainder FALSE)
  f <- subset(f, c(rep(TRUE, times = 50), rep(FALSE, times = nrow(f) - 50)))

  merkel_min <- subset(merkel_cooc, by = f)

  if (requireNamespace("igraph", quietly = TRUE)){
    g <- as_igraph(merkel_min, as.undirected = TRUE)
    plot(g)
  }
}