Get N-Grams — ngrams • polmineR

Count n-grams, either of words or of characters.

ngrams(.Object, ...)

# S4 method for partition
ngrams(
  .Object,
  n = 2,
  p_attribute = "word",
  char = NULL,
  progress = FALSE,
  ...
)

# S4 method for character
ngrams(
  .Object,
  n = 2,
  p_attribute = "word",
  char = NULL,
  progress = FALSE,
  ...
)

# S4 method for partition
ngrams(
  .Object,
  n = 2,
  p_attribute = "word",
  char = NULL,
  progress = FALSE,
  ...
)

# S4 method for subcorpus
ngrams(
  .Object,
  n = 2,
  p_attribute = "word",
  char = NULL,
  progress = FALSE,
  ...
)

# S4 method for character
ngrams(
  .Object,
  n = 2,
  p_attribute = "word",
  char = NULL,
  progress = FALSE,
  ...
)

# S4 method for data.table
ngrams(.Object, n = 2L, p_attribute = "word")

# S4 method for corpus
ngrams(
  .Object,
  n = 2,
  p_attribute = "word",
  char = NULL,
  progress = FALSE,
  ...
)

# S4 method for partition_bundle
ngrams(
  .Object,
  n = 2,
  char = NULL,
  p_attribute = "word",
  mc = FALSE,
  progress = FALSE,
  ...
)

Arguments

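.Object	An object for which a method is defined: a `partition`, `subcorpus`, `corpus`, `partition_bundle`, `data.table`, or `character` vector.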
...	Further arguments.
n	Number of tokens/characters per n-gram (defaults to 2).
p_attribute	The p-attribute(s) to use; more than one may be supplied, e.g. `c("word", "pos")`.
char	If `NULL` (the default), n-grams of tokens are counted; otherwise n-grams of characters are counted, keeping only the characters provided in this character vector.
progress	A `logical` value, whether to show progress (defaults to `FALSE`).
mc	A `logical` value, whether to use multicore, passed into call to `blapply` (see respective documentation)

Examples

use("polmineR")
#> ... activating corpus: GERMAPARLMINI (version: 0.0.1 | build date: 2019-02-23)
#> ... activating corpus: REUTERS
P <- partition("GERMAPARLMINI", date = "2009-10-27")
#> ... get encoding: latin1
#> ... get cpos and strucs
ngramObject <- ngrams(P, n = 2, p_attribute = "word", char = NULL)

# a more complex scenario: get most frequent ADJA/NN-combinations
ngramObject <- ngrams(P, n = 2, p_attribute = c("word", "pos"), char = NULL)
ngramObject2 <- subset(
 ngramObject,
 ngramObject[["1_pos"]] == "ADJA"  & ngramObject[["2_pos"]] == "NN"
 )
#> Error in .checkTypos(e, names_x): Object 'ngramObject' not found amongst word_1, pos_1, word_2, pos_2, count
ngramObject2@stat[, "1_pos" := NULL][, "2_pos" := NULL]
#> Error in eval(expr, envir, enclos): object 'ngramObject2' not found
ngramObject3 <- sort(ngramObject2, by = "count")
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'sort': object 'ngramObject2' not found
head(ngramObject3)
#> Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'ngramObject3' not found
use("polmineR")
#> ... activating corpus: GERMAPARLMINI (version: 0.0.1 | build date: 2019-02-23)
#> ... activating corpus: REUTERS
dt <- decode("REUTERS", p_attribute = "word", s_attribute = character(), to = "data.table")
#> decoding p-attribute:word
#> assembling data.table
y <- ngrams(dt, n = 3L, p_attribute = "word")