Get IDs and Counts for Region Matrices.

region_matrix_to_ids(
  corpus,
  p_attribute,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  matrix
)

region_matrix_to_count_matrix(
  corpus,
  p_attribute,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  matrix
)

region_matrix_context(
  corpus,
  registry = Sys.getenv("CORPUS_REGISTRY"),
  matrix,
  p_attribute,
  s_attribute,
  boundary,
  left,
  right
)

ranges_to_cpos(ranges)

Arguments

corpus

A CWB corpus, given by its ID as a length-one character vector (e.g. "REUTERS").

p_attribute

A positional attribute, given as a length-one character vector (e.g. "word").

registry

Path to the registry directory. Defaults to the value of the CORPUS_REGISTRY environment variable.

matrix

A regions matrix, such as the one returned by get_region_matrix().

s_attribute

If not NULL, a structural attribute (length-one character vector), typically indicating a sentence ("s").

boundary

Structural attribute (length-one character vector) that serves as a boundary that must not be crossed.

left

An integer value, number of strucs to move to the left.

right

An integer value, number of strucs to move to the right.

ranges

A two-column integer matrix of ranges (left and right corpus positions in first and second column, respectively).

Details

ranges_to_cpos() turns a matrix of ranges into an integer vector of the individual corpus positions covered by the ranges.

Examples

# Scenario 1: Get full text for a subcorpus defined by regions
m <- get_region_matrix(
  corpus = "REUTERS", s_attribute = "places",
  strucs = 4L:5L, registry = get_tmp_registry()
  )
ids <- region_matrix_to_ids(
  corpus = "REUTERS", p_attribute = "word",
  registry = get_tmp_registry(), matrix = m
  )
tokenstream <- cl_id2str(
  corpus = "REUTERS", p_attribute = "word",
  registry = get_tmp_registry(), id = ids
  )
txt <- paste(tokenstream, collapse = " ")
txt
#> [1] "Houston Oil Trust said that independent petroleum engineers completed an annual study that estimates the trust's future net revenues from total proved reserves at 88 mln dlrs and its discounted present value of the reserves at 64 mln dlrs Based on the estimate the trust said there may be no money available for cash distributions to unitholders for the remainder of the year It said the estimates reflect a decrease of about 44 pct in net reserve revenues and 39 pct in discounted present value compared with the study made in 1985 Reuter Kuwait s Oil Minister in remarks published today said there were no plans for an emergency OPEC meeting to review oil policies after recent weakness in world oil prices Sheikh Ali al Khalifa al Sabah was quoted by the local daily al Qabas as saying None of the OPEC members has asked for such a meeting He denied Kuwait was pumping above its quota of 948,000 barrels of crude daily bpd set under self imposed production limits of the 13 nation organisation Traders and analysts in international oil markets estimate OPEC is producing up to one mln bpd above a ceiling of 15.8 mln bpd agreed in Geneva last December They named Kuwait and the United Arab Emirates along with the much smaller producer Ecuador among those producing above quota Kuwait they said was pumping 1.2 mln bpd This rumour is baseless It is based on reports which said Kuwait has the ability to exceed its share They suppose that because Kuwait has the ability it will do so the minister said Sheikh Ali has said before that Kuwait had the ability to produce up to 4.0 mln bpd If we can sell more than our quota at official prices while some countries are suffering difficulties marketing their share it means we in Kuwait are unusually clever he said He was referring apparently to the Gulf state of qatar which industry sources said was selling less than 180,000 bpd of its 285,000 bpd quota because buyers were resisting official prices restored by OPEC last month pegged to a 
marker of 18 dlrs per barrel Prices in New York last week dropped to their lowest levels this year and almost three dollars below a three month high of 19 dollars a barrel Sheikh Ali also delivered a challenge to any international oil company that declared Kuwait sold below official prices Because it was charging its official price of 16.67 dlrs a barrel it had lost custom he said but did not elaborate However Kuwait had guaranteed markets for its oil because of its local and international refining facilities and its own distribution network abroad he added He reaffirmed that the planned meeting March 7 of OPEC s differentials committee has been postponed until the start of April at the request of certain of the body s members Ecuador s deputy energy minister Fernando Santos Alvite said last Wednesday his debt burdened country wanted OPEC to assign a lower official price for its crude and was to seek this at talks this month of opec s pricing committee Referring to pressure by oil companies on OPEC members in apparent reference to difficulties faced by Qatar he said We expected such pressure It will continue through March and April But he expected the situation would later improve REUTER"

# Scenario 2: Get a data.frame with token counts for a region matrix
y <- region_matrix_to_count_matrix(
  corpus = "REUTERS", p_attribute = "word",
  registry = get_tmp_registry(), matrix = m
  )
df <- as.data.frame(y)
colnames(df) <- c("token_id", "count")
df[["token"]] <- cl_id2str(
  "REUTERS", p_attribute = "word",
  registry = get_tmp_registry(), id = df[["token_id"]]
  )
df[order(df[["count"]], decreasing = TRUE),]
#>     token_id count         token
#> 17        31    23           the
#> 21        37    19            of
#> 16        29    14            to
#> 1          3    13          said
#> 20        35    10            in
#> 135      347    10        Kuwait
#> 6         10     9           its
#> 13        19     9             a
#> 22        40     9           and
#> 18        33     7           was
#> 29        59     7          OPEC
#> 57       150     7           bpd
#> 2          4     6          that
#> 8         13     6           for
#> 10        15     6           oil
#> 56       149     6           mln
#> 61       154     5           has
#> 66       171     5            he
#> 81       214     5            at
#> 136      348     5             s
#> 202      414     5      official
#> 4          7     4            it
#> 7         12     4        prices
#> 11        16     4            by
#> 12        18     4          dlrs
#> 26        53     4          last
#> 58       151     4         quota
#> 5          8     3           had
#> 14        20     3        barrel
#> 24        45     3            is
#> 40        90     3       meeting
#> 46       123     3            He
#> 48       128     3       ability
#> 54       147     3         above
#> 65       169     3            on
#> 86       249     3          this
#> 127      339     3            It
#> 146      358     3        Sheikh
#> 147      359     3           Ali
#> 148      360     3            al
#> 157      369     3       members
#> 171      383     3 international
#> 196      408     3       because
#> 222      434     3         month
#> 9         14     2         crude
#> 15        25     2         price
#> 27        57     2       markets
#> 37        86     2          They
#> 38        88     2            an
#> 45       119     2          with
#> 52       143     2          were
#> 72       193     2          will
#> 75       201     2         their
#> 76       203     2         March
#> 77       204     2         April
#> 80       211     2          than
#> 88       259     2            we
#> 89       260     2           are
#> 94       306     2           Oil
#> 101      313     2         study
#> 102      314     2     estimates
#> 105      317     2           net
#> 106      318     2      revenues
#> 110      322     2      reserves
#> 112      324     2    discounted
#> 113      325     2       present
#> 114      326     2         value
#> 116      328     2      estimate
#> 118      330     2         there
#> 119      331     2            no
#> 126      338     2          year
#> 130      342     2           pct
#> 152      364     2         local
#> 153      365     2         daily
#> 159      371     2          such
#> 161      373     2       pumping
#> 172      384     2     producing
#> 173      385     2            up
#> 184      396     2       Ecuador
#> 192      404     2         which
#> 194      406     2         share
#> 197      409     2      minister
#> 206      418     2  difficulties
#> 233      445     2         three
#> 234      446     2       dollars
#> 235      447     2         below
#> 260      472     2     committee
#> 285      497     2      pressure
#> 291      503     2      expected
#> 3          6     1         today
#> 19        34     1          made
#> 23        43     1       company
#> 25        49     1     companies
#> 28        58     1        Reuter
#> 30        60     1           may
#> 31        61     1            be
#> 32        64     1        before
#> 33        69     1    production
#> 34        78     1      industry
#> 35        79     1      analysts
#> 36        83     1            as
#> 39        89     1     emergency
#> 41       103     1       sources
#> 42       108     1         world
#> 43       110     1           not
#> 44       111     1           but
#> 47       127     1         about
#> 49       130     1         under
#> 50       140     1           But
#> 51       141     1          some
#> 53       146     1         would
#> 55       148     1          15.8
#> 59       152     1           set
#> 60       153     1      December
#> 62       156     1        buyers
#> 63       162     1 differentials
#> 64       165     1           one
#> 67       172     1         named
#> 68       179     1         added
#> 69       185     1       reports
#> 70       189     1       However
#> 71       191     1           can
#> 73       194     1          they
#> 74       200     1          sell
#> 78       208     1            do
#> 79       210     1          more
#> 82       234     1           New
#> 83       235     1          York
#> 84       244     1          been
#> 85       246     1       through
#> 87       257     1            so
#> 90       272     1            64
#> 91       275     1      decrease
#> 92       289     1            19
#> 93       305     1       Houston
#> 95       307     1         Trust
#> 96       308     1   independent
#> 97       309     1     petroleum
#> 98       310     1     engineers
#> 99       311     1     completed
#> 100      312     1        annual
#> 103      315     1       trust's
#> 104      316     1        future
#> 107      319     1          from
#> 108      320     1         total
#> 109      321     1        proved
#> 111      323     1            88
#> 115      327     1         Based
#> 117      329     1         trust
#> 120      332     1         money
#> 121      333     1     available
#> 122      334     1          cash
#> 123      335     1 distributions
#> 124      336     1   unitholders
#> 125      337     1     remainder
#> 128      340     1       reflect
#> 129      341     1            44
#> 131      343     1       reserve
#> 132      344     1            39
#> 133      345     1      compared
#> 134      346     1          1985
#> 137      349     1      Minister
#> 138      350     1       remarks
#> 139      351     1     published
#> 140      352     1         plans
#> 141      353     1        review
#> 142      354     1      policies
#> 143      355     1         after
#> 144      356     1        recent
#> 145      357     1      weakness
#> 149      361     1       Khalifa
#> 150      362     1         Sabah
#> 151      363     1        quoted
#> 154      366     1         Qabas
#> 155      367     1        saying
#> 156      368     1          None
#> 158      370     1         asked
#> 160      372     1        denied
#> 162      374     1       948,000
#> 163      375     1       barrels
#> 164      376     1          self
#> 165      377     1       imposed
#> 166      378     1        limits
#> 167      379     1            13
#> 168      380     1        nation
#> 169      381     1  organisation
#> 170      382     1       Traders
#> 174      386     1       ceiling
#> 175      387     1        agreed
#> 176      388     1        Geneva
#> 177      389     1        United
#> 178      390     1          Arab
#> 179      391     1      Emirates
#> 180      392     1         along
#> 181      393     1          much
#> 182      394     1       smaller
#> 183      395     1      producer
#> 185      397     1         among
#> 186      398     1         those
#> 187      399     1           1.2
#> 188      400     1          This
#> 189      401     1        rumour
#> 190      402     1      baseless
#> 191      403     1         based
#> 193      405     1        exceed
#> 195      407     1       suppose
#> 198      410     1       produce
#> 199      411     1           4.0
#> 200      412     1            If
#> 201      413     1           our
#> 203      415     1         while
#> 204      416     1     countries
#> 205      417     1     suffering
#> 207      419     1     marketing
#> 208      420     1         means
#> 209      421     1     unusually
#> 210      422     1        clever
#> 211      423     1     referring
#> 212      424     1    apparently
#> 213      425     1          Gulf
#> 214      426     1         state
#> 215      427     1         qatar
#> 216      428     1       selling
#> 217      429     1          less
#> 218      430     1       180,000
#> 219      431     1       285,000
#> 220      432     1     resisting
#> 221      433     1      restored
#> 223      435     1        pegged
#> 224      436     1        marker
#> 225      437     1            18
#> 226      438     1           per
#> 227      439     1        Prices
#> 228      440     1          week
#> 229      441     1       dropped
#> 230      442     1        lowest
#> 231      443     1        levels
#> 232      444     1        almost
#> 236      448     1          high
#> 237      449     1          also
#> 238      450     1     delivered
#> 239      451     1     challenge
#> 240      452     1           any
#> 241      453     1      declared
#> 242      454     1          sold
#> 243      455     1       Because
#> 244      456     1      charging
#> 245      457     1         16.67
#> 246      458     1          lost
#> 247      459     1        custom
#> 248      460     1           did
#> 249      461     1     elaborate
#> 250      462     1    guaranteed
#> 251      463     1      refining
#> 252      464     1    facilities
#> 253      465     1           own
#> 254      466     1  distribution
#> 255      467     1       network
#> 256      468     1        abroad
#> 257      469     1    reaffirmed
#> 258      470     1       planned
#> 259      471     1             7
#> 261      473     1     postponed
#> 262      474     1         until
#> 263      475     1         start
#> 264      476     1       request
#> 265      477     1       certain
#> 266      478     1          body
#> 267      479     1        deputy
#> 268      480     1        energy
#> 269      481     1      Fernando
#> 270      482     1        Santos
#> 271      483     1        Alvite
#> 272      484     1     Wednesday
#> 273      485     1           his
#> 274      486     1          debt
#> 275      487     1      burdened
#> 276      488     1       country
#> 277      489     1        wanted
#> 278      490     1        assign
#> 279      491     1         lower
#> 280      492     1          seek
#> 281      493     1         talks
#> 282      494     1          opec
#> 283      495     1       pricing
#> 284      496     1     Referring
#> 286      498     1      apparent
#> 287      499     1     reference
#> 288      500     1         faced
#> 289      501     1         Qatar
#> 290      502     1            We
#> 292      504     1      continue
#> 293      505     1     situation
#> 294      506     1         later
#> 295      507     1       improve
#> 296      508     1        REUTER
head(df)
#>   token_id count token
#> 1        3    13  said
#> 2        4     6  that
#> 3        6     1 today
#> 4        7     4    it
#> 5        8     3   had
#> 6       10     9   its