Get cooccurrence statistics.
cooccurrences(.Object, ...) # S4 method for corpus cooccurrences( .Object, query, cqp = is.cqp, p_attribute = getOption("polmineR.p_attribute"), boundary = NULL, left = getOption("polmineR.left"), right = getOption("polmineR.right"), stoplist = NULL, positivelist = NULL, regex = FALSE, keep = NULL, cpos = NULL, method = "ll", mc = getOption("polmineR.mc"), verbose = FALSE, progress = FALSE, ... ) # S4 method for character cooccurrences( .Object, query, cqp = is.cqp, p_attribute = getOption("polmineR.p_attribute"), boundary = NULL, left = getOption("polmineR.left"), right = getOption("polmineR.right"), stoplist = NULL, positivelist = NULL, regex = FALSE, keep = NULL, cpos = NULL, method = "ll", mc = getOption("polmineR.mc"), verbose = FALSE, progress = FALSE, ... ) # S4 method for slice cooccurrences( .Object, query, cqp = is.cqp, left = getOption("polmineR.left"), right = getOption("polmineR.right"), p_attribute = getOption("polmineR.p_attribute"), boundary = NULL, stoplist = NULL, positivelist = NULL, keep = NULL, method = "ll", mc = FALSE, progress = TRUE, verbose = FALSE, ... ) # S4 method for partition cooccurrences( .Object, query, cqp = is.cqp, left = getOption("polmineR.left"), right = getOption("polmineR.right"), p_attribute = getOption("polmineR.p_attribute"), boundary = NULL, stoplist = NULL, positivelist = NULL, keep = NULL, method = "ll", mc = FALSE, progress = TRUE, verbose = FALSE, ... ) # S4 method for subcorpus cooccurrences( .Object, query, cqp = is.cqp, left = getOption("polmineR.left"), right = getOption("polmineR.right"), p_attribute = getOption("polmineR.p_attribute"), boundary = NULL, stoplist = NULL, positivelist = NULL, keep = NULL, method = "ll", mc = FALSE, progress = TRUE, verbose = FALSE, ... ) # S4 method for context cooccurrences(.Object, method = "ll", verbose = FALSE) # S4 method for partition_bundle cooccurrences(.Object, query, mc = getOption("polmineR.mc"), ...) 
# S4 method for Cooccurrences cooccurrences(.Object, query) # S4 method for remote_corpus cooccurrences(.Object, ...) # S4 method for remote_subcorpus cooccurrences(.Object, ...)
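.Object | An object for which a `cooccurrences` method is defined (see usage): a `corpus`, `character`, `slice`, `partition`, `subcorpus`, `context`, `partition_bundle`, `Cooccurrences`, `remote_corpus` or `remote_subcorpus` object. |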
---|---|
... | Further parameters that will be passed into bigmatrix (applies only if big = TRUE). |
query | A query, either a character vector to match a token, or a CQP query. |
cqp | Whether `query` is to be treated as a CQP query. Defaults to `is.cqp` (see usage). |
p_attribute | The p-attribute of the tokens/the query. |
boundary | If provided, it will be checked that the corpus positions of windows do not extend beyond the left and right boundaries of the region defined by the s-attribute where the match occurs. |
left | Number of tokens to the left of the query match. |
right | Number of tokens to the right of the query match. |
stoplist | Exclude a query hit from analysis if stopword(s) is/are in
context (relevant only if query is not `NULL`). |
positivelist | Character vector or numeric vector: include a query hit
only if a token in `positivelist` is present. |
regex | A `logical` value, whether `stoplist` and `positivelist` are interpreted as regular expressions. |
keep | list with tokens to keep |
cpos | Integer vector with corpus positions; defaults to `NULL`, in which case the corpus positions of the whole corpus are used. |
method | The statistical test(s) to use (defaults to "ll"). |
mc | whether to use multicore |
verbose | A `logical` value, whether to output messages. |
progress | A `logical` value, whether to show a progress bar. |
a cooccurrences-class object
Baker, Paul (2006): Using Corpora in Discourse Analysis. London: Continuum, pp. 95-120 (ch. 5).
Manning, Christopher D.; Schuetze, Hinrich (1999): Foundations of Statistical Natural Language Processing. MIT Press: Cambridge, Mass., pp. 151-189 (ch. 5).
See the documentation for the `ll`-method for an explanation of the computation of the log-likelihood statistic.
Andreas Blaette
#>#>#>#>#>cooc <- cooccurrences(merkel, query = "Deutschland") # use subset-method to filter results a <- cooccurrences("REUTERS", query = "oil") b <- subset(a, !is.na(ll)) c <- subset(b, !word %in% tm::stopwords("en")) d <- subset(c, count_coi >= 5) e <- subset(c, ll >= 10.83) format(e)#> word count_coi count_ref exp_coi exp_ref ll rank_ll #> 1: prices 27 20 9.23 37.77 33.05 1 #> 2: crude 13 7 3.93 16.07 19.62 2 #> 3: industry 8 2 1.96 8.04 16.97 3 #> 4: recent 5 1 1.18 4.82 11.33 4# using pipe operator may be convenient if (require(magrittr)){ cooccurrences("REUTERS", query = "oil") %>% subset(!is.na(ll)) %>% subset(!word %in% tm::stopwords("en")) %>% subset(count_coi >= 5) %>% subset(ll >= 10.83) %>% format() }#>#> word count_coi count_ref exp_coi exp_ref ll rank_ll #> 1: prices 27 20 9.23 37.77 33.05 1 #> 2: crude 13 7 3.93 16.07 19.62 2 #> 3: industry 8 2 1.96 8.04 16.97 3 #> 4: recent 5 1 1.18 4.82 11.33 4pb <- partition_bundle("GERMAPARLMINI", s_attribute = "speaker") pb_min <- pb[[ count(pb, query = "Deutschland")[Deutschland >= 25][["partition"]] ]] y <- cooccurrences(pb_min, query = "Deutschland") if (interactive()) y[[1]] if (interactive()) y[[2]] y2 <- corpus("GERMAPARLMINI") %>% subset(speaker %in% c("Hubertus Heil", "Angela Dorothea Merkel")) %>% split(s_attribute = "speaker") %>% cooccurrences(query = "Deutschland")