Get features by comparison. — features • polmineR

The features of two objects, usually a partition defining a corpus of interest (coi) and a partition defining a reference corpus (ref), are compared. The most important purpose is term extraction.

features(x, y, ...)

# S4 method for partition
features(x, y, included = FALSE, method = "chisquare", verbose = FALSE)

# S4 method for count
features(
  x,
  y,
  by = NULL,
  included = FALSE,
  method = "chisquare",
  verbose = TRUE
)

# S4 method for partition_bundle
features(
  x,
  y,
  included = FALSE,
  method = "chisquare",
  verbose = TRUE,
  mc = getOption("polmineR.mc"),
  progress = FALSE
)

# S4 method for count_bundle
features(
  x,
  y,
  included = FALSE,
  method = "chisquare",
  verbose = !progress,
  mc = getOption("polmineR.mc"),
  progress = FALSE
)

# S4 method for ngrams
features(x, y, included = FALSE, method = "chisquare", verbose = TRUE, ...)

# S4 method for Cooccurrences
features(x, y, included = FALSE, method = "ll", verbose = TRUE)

Arguments

x	A `partition`, `count`, `partition_bundle`, `count_bundle`, `ngrams` or `Cooccurrences` object defining the corpus of interest (coi).
y	A `partition` or `count` object defining the reference corpus (ref). If the coi is a subcorpus of ref, set `included = TRUE`.
...	further parameters
included	TRUE if coi is part of ref, defaults to FALSE
method	The statistical test to apply: `"chisquare"` (chi-squared test) or `"ll"` (log-likelihood test).
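verbose	A `logical` value. The default depends on the method: FALSE for `partition`, `!progress` for `count_bundle`, TRUE for all other methods.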
by	The columns used for merging; if NULL (default), the p-attribute of x will be used.
mc	logical, whether to use multicore
progress	A `logical` value, whether to show a progress bar; defaults to FALSE.

References

Baker, Paul (2006): Using Corpora in Discourse Analysis. London: Continuum, pp. 121-149 (ch. 6).

Manning, Christopher D.; Schuetze, Hinrich (1999): Foundations of Statistical Natural Language Processing. MIT Press: Cambridge, Mass., pp. 151-189 (ch. 5).

Author

Andreas Blaette

Examples

use("polmineR")
#> ... activating corpus: GERMAPARLMINI (version: 0.0.1 | build date: 2019-02-23)
#> ... activating corpus: REUTERS

kauder <- partition(
  "GERMAPARLMINI",
  speaker = "Volker Kauder", interjection = "speech",
  p_attribute = "word"
  )
#> ... get encoding: latin1
#> ... get cpos and strucs
#> ... getting counts for p-attribute(s): word
all <- partition("GERMAPARLMINI", interjection = "speech", p_attribute = "word")
#> ... get encoding: latin1
#> ... get cpos and strucs
#> ... getting counts for p-attribute(s): word

terms_kauder <- features(x = kauder, y = all, included = TRUE)
top100 <- subset(terms_kauder, rank_chisquare <= 100)
head(top100)
#>                word word_id.x count_coi word_id.y count_ref    exp_coi
#> 1:     Zusammenhalt      2538        12      2538        32 0.79902610
#> 2:           Räumen      5545         5      5545         4 0.16343716
#> 3:         Deswegen      4340        18      4340       105 2.23364113
#> 4: Ballungsgebieten      5543         2      5543         0 0.03631937
#> 5:        Höchstmaß      5422         2      5422         0 0.03631937
#> 6:           langer      5517         2      5517         0 0.03631937
#>    chisquare rank_chisquare
#> 1:  159.9600              1
#> 2:  145.7816              2
#> 3:  113.4208              3
#> 4:  108.1352              4
#> 5:  108.1352              5
#> 6:  108.1352              6

# a different way is to compare count objects
kauder_count <- as(kauder, "count")
all_count <- as(all, "count")
terms_kauder <- features(kauder_count, all_count, included = TRUE)
#> ... combining frequency lists
#> ... statistical test:  chisquare
top100 <- subset(terms_kauder, rank_chisquare <= 100)
head(top100)
#>                word word_id.x count_coi word_id.y count_ref    exp_coi
#> 1:     Zusammenhalt      2538        12      2538        32 0.79902610
#> 2:           Räumen      5545         5      5545         4 0.16343716
#> 3:         Deswegen      4340        18      4340       105 2.23364113
#> 4: Ballungsgebieten      5543         2      5543         0 0.03631937
#> 5:        Höchstmaß      5422         2      5422         0 0.03631937
#> 6:           langer      5517         2      5517         0 0.03631937
#>    chisquare rank_chisquare
#> 1:  159.9600              1
#> 2:  145.7816              2
#> 3:  113.4208              3
#> 4:  108.1352              4
#> 5:  108.1352              5
#> 6:  108.1352              6

speakers <- partition_bundle("GERMAPARLMINI", s_attribute = "speaker")
speakers <- enrich(speakers, p_attribute = "word")
speaker_terms <- features(speakers[[1:5]], all, included = TRUE, progress = TRUE)
dtm <- as.DocumentTermMatrix(speaker_terms, col = "chisquare")
#> ... using the p_attribute-slot of the first object in the bundle as p_attribute: word
#> ... generating (temporary) key column
#> ... generating cumulated data.table
#> ... getting unique keys
#> ... generating integer keys
#> ... cleaning up temporary key columns
# Get features of objects in a count_bundle
ref <- corpus("GERMAPARLMINI") %>% count(p_attribute = "word")
cois <- corpus("GERMAPARLMINI") %>%
  subset(speaker %in% c("Angela Dorothea Merkel", "Hubertus Heil")) %>%
  split(s_attribute = "speaker") %>%
  count(p_attribute = "word")
y <- features(cois, ref, included = TRUE, method = "chisquare", progress = TRUE)