Get the total number of unique tokens/ids (the lexicon size) of a positional attribute. Note that token ids are zero-based: when iterating through token ids, start at 0; the maximum id is cl_lexicon_size() minus 1.

cl_lexicon_size(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"))

Arguments

corpus

name of a CWB corpus (upper case)

p_attribute

name of positional attribute

registry

path to the registry directory; defaults to the value of the environment variable CORPUS_REGISTRY

Examples

lexicon_size <- cl_lexicon_size(
  "REUTERS",
  p_attribute = "word",
  registry = get_tmp_registry()
)

token_ids <- seq.int(from = 0, to = lexicon_size - 1)
cl_id2str(
  "REUTERS",
  p_attribute = "word",
  id = token_ids,
  registry = get_tmp_registry()
)
#>    [1] "Diamond"         "Shamrock"        "Corp"            "said"           
#>    [5] "that"            "effective"       "today"           "it"             
#>    [9] "had"             "cut"             "its"             "contract"       
#>   [13] "prices"          "for"             "crude"           "oil"            
#>   [17] "by"              "1.50"            "dlrs"            "a"              
#>   [21] "barrel"          "The"             "reduction"       "brings"         
#>   [25] "posted"          "price"           "West"            "Texas"          
#>   [29] "Intermediate"    "to"              "16.00"           "the"            
#>   [33] "copany"          "was"             "made"            "in"             
#>   [37] "light"           "of"              "falling"         "product"        
#>   [41] "and"             "weak"            "market"          "company"        
#>   [45] "spokeswoman"     "is"              "latest"          "line"           
#>   [49] "U.S"             "companies"       "have"            "or"             
#>   [53] "over"            "last"            "two"             "days"           
#>   [57] "citing"          "markets"         "Reuter"          "OPEC"           
#>   [61] "may"             "be"              "forced"          "meet"           
#>   [65] "before"          "scheduled"       "June"            "session"        
#>   [69] "readdress"       "production"      "cutting"         "agreement"      
#>   [73] "if"              "organization"    "wants"           "halt"           
#>   [77] "current"         "slide"           "industry"        "analysts"       
#>   [81] "movement"        "higher"          "never"           "as"             
#>   [85] "easy"            "thought"         "They"            "need"           
#>   [89] "an"              "emergency"       "meeting"         "sort"           
#>   [93] "out"             "problems"        "Daniel"          "Yergin"         
#>   [97] "director"        "Cambridge"       "Energy"          "Research"       
#>  [101] "Associates"      "CERA"            "Analysts"        "sources"        
#>  [105] "problem"         "faces"           "excess"          "supply"         
#>  [109] "world"           "OPEC's"          "not"             "but"            
#>  [113] "issue"           "must"            "addressed"       "way"            
#>  [117] "Paul"            "Mlotok"          "analyst"         "with"           
#>  [121] "Salomon"         "Brothers"        "Inc"             "He"             
#>  [125] "market's"        "earlier"         "optimism"        "about"          
#>  [129] "ability"         "keep"            "under"           "control"        
#>  [133] "given"           "pessimistic"     "outlook"         "address"        
#>  [137] "soon"            "wishes"          "regain"          "initiative"     
#>  [141] "But"             "some"            "other"           "were"           
#>  [145] "uncertain"       "even"            "would"           "above"          
#>  [149] "15.8"            "mln"             "bpd"             "quota"          
#>  [153] "set"             "December"        "has"             "learn"          
#>  [157] "buyers"          "you"             "cannot"          "deemed"         
#>  [161] "quotas"          "fixed"           "differentials"   "regional"       
#>  [165] "manager"         "one"             "major"           "who"            
#>  [169] "spoke"           "on"              "condition"       "he"             
#>  [173] "named"           "now"             "trying"          "teach"          
#>  [177] "them"            "lesson"          "again"           "added"          
#>  [181] "David"           "T"               "Mizrahi"         "editor"         
#>  [185] "Mideast"         "reports"         "expects"         "although"       
#>  [189] "immediately"     "However"         "optimistic"      "can"            
#>  [193] "principal"       "will"            "they"            "try"            
#>  [197] "take"            "advantage"       "winter"          "demand"         
#>  [201] "sell"            "their"           "late"            "March"          
#>  [205] "April"           "when"            "slackens"        "unlikely"       
#>  [209] "do"              "anything"        "more"            "than"           
#>  [213] "reiterate"       "output"          "at"              "next"           
#>  [217] "months"          "critical"        "hold"            "together"       
#>  [221] "pact"            "six"             "eight"           "weeks"          
#>  [225] "since"           "come"            "back"            "into"           
#>  [229] "then"            "Dillard"         "Spriggs"         "Petroleum"      
#>  [233] "Analysis"        "Ltd"             "New"             "York"           
#>  [237] "Bijan"           "Moussavar"       "Rahmani"         "Harvard"        
#>  [241] "University's"    "Environment"     "Policy"          "Center"         
#>  [245] "been"            "rising"          "through"         "first"          
#>  [249] "quarter"         "this"            "prompted"        "excesses"       
#>  [253] "Demand"          "clearly"         "probably"        "closer"         
#>  [257] "17"              "so"              "what"            "we"             
#>  [261] "are"             "seeing"          "characterized"   "cheating"       
#>  [265] "told"            "Reuters"         "telephone"       "interview"      
#>  [269] "Texaco"          "Canada"          "lowered"         "pay"            
#>  [273] "64"              "Canadian"        "cts"             "decrease"       
#>  [277] "company's"       "benchmark"       "grade"           "Edmonton"       
#>  [281] "Swann"           "Hills"           "Light"           "Sweet"          
#>  [285] "22.26"           "bbl"             "changed"         "postings"       
#>  [289] "Feb"             "19"              "Marathon"        "Co"             
#>  [293] "reduced"         "all"             "grades"          "dlr"            
#>  [297] "Marathon's"      "both"            "Sour"            "16.50"          
#>  [301] "South"           "Louisiana"       "16.85"           "Jan"            
#>  [305] "12"              "Houston"         "Oil"             "Trust"          
#>  [309] "independent"     "petroleum"       "engineers"       "completed"      
#>  [313] "annual"          "study"           "estimates"       "trust's"        
#>  [317] "future"          "net"             "revenues"        "from"           
#>  [321] "total"           "proved"          "reserves"        "88"             
#>  [325] "discounted"      "present"         "value"           "Based"          
#>  [329] "estimate"        "trust"           "there"           "no"             
#>  [333] "money"           "available"       "cash"            "distributions"  
#>  [337] "unitholders"     "remainder"       "year"            "It"             
#>  [341] "reflect"         "44"              "pct"             "reserve"        
#>  [345] "39"              "compared"        "1985"            "Kuwait"         
#>  [349] "s"               "Minister"        "remarks"         "published"      
#>  [353] "plans"           "review"          "policies"        "after"          
#>  [357] "recent"          "weakness"        "Sheikh"          "Ali"            
#>  [361] "al"              "Khalifa"         "Sabah"           "quoted"         
#>  [365] "local"           "daily"           "Qabas"           "saying"         
#>  [369] "None"            "members"         "asked"           "such"           
#>  [373] "denied"          "pumping"         "948,000"         "barrels"        
#>  [377] "self"            "imposed"         "limits"          "13"             
#>  [381] "nation"          "organisation"    "Traders"         "international"  
#>  [385] "producing"       "up"              "ceiling"         "agreed"         
#>  [389] "Geneva"          "United"          "Arab"            "Emirates"       
#>  [393] "along"           "much"            "smaller"         "producer"       
#>  [397] "Ecuador"         "among"           "those"           "1.2"            
#>  [401] "This"            "rumour"          "baseless"        "based"          
#>  [405] "which"           "exceed"          "share"           "suppose"        
#>  [409] "because"         "minister"        "produce"         "4.0"            
#>  [413] "If"              "our"             "official"        "while"          
#>  [417] "countries"       "suffering"       "difficulties"    "marketing"      
#>  [421] "means"           "unusually"       "clever"          "referring"      
#>  [425] "apparently"      "Gulf"            "state"           "qatar"          
#>  [429] "selling"         "less"            "180,000"         "285,000"        
#>  [433] "resisting"       "restored"        "month"           "pegged"         
#>  [437] "marker"          "18"              "per"             "Prices"         
#>  [441] "week"            "dropped"         "lowest"          "levels"         
#>  [445] "almost"          "three"           "dollars"         "below"          
#>  [449] "high"            "also"            "delivered"       "challenge"      
#>  [453] "any"             "declared"        "sold"            "Because"        
#>  [457] "charging"        "16.67"           "lost"            "custom"         
#>  [461] "did"             "elaborate"       "guaranteed"      "refining"       
#>  [465] "facilities"      "own"             "distribution"    "network"        
#>  [469] "abroad"          "reaffirmed"      "planned"         "7"              
#>  [473] "committee"       "postponed"       "until"           "start"          
#>  [477] "request"         "certain"         "body"            "deputy"         
#>  [481] "energy"          "Fernando"        "Santos"          "Alvite"         
#>  [485] "Wednesday"       "his"             "debt"            "burdened"       
#>  [489] "country"         "wanted"          "assign"          "lower"          
#>  [493] "seek"            "talks"           "opec"            "pricing"        
#>  [497] "Referring"       "pressure"        "apparent"        "reference"      
#>  [501] "faced"           "Qatar"           "We"              "expected"       
#>  [505] "continue"        "situation"       "later"           "improve"        
#>  [509] "REUTER"          "Indonesia"       "appears"         "nearing"        
#>  [513] "political"       "crossroads"      "measures"        "deregulate"     
#>  [517] "protected"       "economy"         "Embassy"         "says"           
#>  [521] "new"             "report"          "To"              "counter"        
#>  [525] "government"      "launched"        "series"          "past"           
#>  [529] "nine"            "boost"           "exports"         "outside"        
#>  [533] "sector"          "attract"         "investment"      "only"           
#>  [537] "Asian"           "member"          "leading"         "primary"        
#>  [541] "commodity"       "severely"        "hit"             "fall"           
#>  [545] "devalue"         "currency"        "31"              "September"      
#>  [549] "President"       "Suharto"         "divided"         "direction"      
#>  [553] "lead"            "regard"          "deregulation"    "pertains"       
#>  [557] "investments"     "imports"         "primarily"       "assesses"       
#>  [561] "agricultural"    "reviews"         "general"         "economic"       
#>  [565] "performance"     "many"            "officials"       "advisers"       
#>  [569] "recommending"    "further"         "relaxation"      "equally"        
#>  [573] "strong"          "pressures"       "being"           "exerted"        
#>  [577] "moves"           "group"           "strongly"        "favours"        
#>  [581] "import"          "substitution"    "changes"         "welcomed"       
#>  [585] "World"           "Bank"            "bankers"         "steps"          
#>  [589] "right"           "though"          "say"             "crucial"        
#>  [593] "areas"           "like"            "plastics"        "steel"          
#>  [597] "remain"          "highly"          "virtual"         "monopolies"     
#>  [601] "Three"           "sets"            "announced"       "May"            
#>  [605] "broadened"       "foreign"         "trade"           "restrictions"   
#>  [609] "liberalised"     "growth"          "calendar"        "1986"           
#>  [613] "zero"            "contracted"      "bit"             "rate"           
#>  [617] "mid"             "1960s"           "notes"           "largest"        
#>  [621] "East"            "Asia"            "population"      "168"            
#>  [625] "million"         "facing"          "elections"       "little"         
#>  [629] "hope"            "swift"           "improvement"     "For"            
#>  [633] "1987"            "early"           "indications"     "point"          
#>  [637] "slightly"        "positive"        "exceeding"       "Economic"       
#>  [641] "activity"        "continues"       "suffer"          "due"            
#>  [645] "sharp"           "export"          "earnings"        "Growth"         
#>  [649] "non"             "low"             "domestic"        "coupled"        
#>  [653] "excessive"       "plant"           "capacity"        "real"           
#>  [657] "declines"        "construction"    "level"           "agriculture"    
#>  [661] "states"          "Bankers"         "continuation"    "reforms"        
#>  [665] "get"             "lending"         "needs"           "A"              
#>  [669] "loan"            "300"             "balance"         "payments"       
#>  [673] "support"         "partly"          "help"            "maintain"       
#>  [677] "momentum"        "reform"          "Saudi"           "riyal"          
#>  [681] "interbank"       "deposits"        "steady"          "yesterday's"    
#>  [685] "quiet"           "reluctant"       "positions"       "amidst"         
#>  [689] "uncertainty"     "whether"         "succeed"         "halting"        
#>  [693] "decline"         "yesterday"       "several"         "producers"      
#>  [697] "difficulty"      "traditional"     "Sunday"          "lull"           
#>  [701] "trading"         "European"        "weekend"         "contributed"    
#>  [705] "lack"            "Spot"            "rates"           "put"            
#>  [709] "6"               "1"               "4"               "5"              
#>  [713] "3"               "quotes"          "ranging"         "between"        
#>  [717] "seven"           "One"             "unchanged"       "8"              
#>  [721] "respectively"    "spot"            "quietly"         "firmer"         
#>  [725] "3.7495"          "98"              "dollar"          "3.7500"         
#>  [729] "03"              "recovering"      "year's"          "budget"         
#>  [733] "projected"       "deficit"         "5.472"           "billion"        
#>  [737] "riyals"          "shortfall"       "7.3"             "86"             
#>  [741] "In"              "statement"       "outlining"       "fiscal"         
#>  [745] "beginning"       "Finance"         "Abdul"           "Aziz"           
#>  [749] "bin"             "Thani"           "spend"           "12.217"         
#>  [753] "period"          "Projected"       "expenditure"     "15.6"           
#>  [757] "revenue"         "6.745"           "down"            "30"             
#>  [761] "9.7"             "failed"          "publish"         "87"             
#>  [765] "surrounding"     "during"          "decided"         "limit"          
#>  [769] "recurrent"       "each"            "twelfth"         "previous"       
#>  [773] "allocations"     "minus"           "15"              "urged"          
#>  [777] "heads"           "departments"     "public"          "institutions"   
#>  [781] "rationalise"     "how"             "covered"         "taken"          
#>  [785] "order"           "relieve"         "burden"          "placed"         
#>  [789] "country's"       "2.766"           "allocated"       "projects"       
#>  [793] "including"       "housing"         "buildings"       "social"         
#>  [797] "services"        "health"          "education"       "transport"      
#>  [801] "communications"  "electricity"     "water"           "No"             
#>  [805] "figure"          "revealed"        "defence"         "security"       
#>  [809] "There"           "projection"      "day"             "Our"            
#>  [813] "expectations"    "signs"           "regarding"       "trends"         
#>  [817] "foremost"        "determination"   "shoulder"        "responsibilites"
#>  [821] "protect"         "wealth"          "helped"          "us"             
#>  [825] "make"            "reasonable"      "coming"          "basis"          
#>  [829] "assigned"        "Arabian"         "Hisham"          "Nazer"          
#>  [833] "reiterated"      "kingdom's"       "commitment"      "December's"     
#>  [837] "accord"          "stabilise"       "Press"           "Agency"         
#>  [841] "SPA"             "Asked"           "agency"          "free"           
#>  [845] "Arabia"          "fully"           "adhering"        "Accord"         
#>  [849] "pronounced"      "circumstance"    "end"             "northern"       
#>  [853] "hemisphere"      "season"          "glut"            "main"           
#>  [857] "architect"       "7.25"            "return"          "around"         
#>  [861] "followed"        "turmoil"         "saw"             "slump"          
#>  [865] "briefly"         "10"              "Free"            "currently"      
#>  [869] "just"            "16"              "Arabia's"        "adherence"      
#>  [873] "shown"           "contacts"        "showed"          "stick"          
#>  [877] "Jamaica"         "Rilwanu"         "Lukman"          "Nigerian"       
#>  [881] "aware"           "negative"        "forces"          "manipulate"     
#>  [885] "operations"      "satisfied"       "fundamentals"    "exist"          
#>  [889] "stable"          "conditions"      "Kuwait's"        "emirate's"      
#>  [893] "Al"              "fell"            "average"         "3.5"            
#>  [897] "3.8"             "January"         "Ras"             "Tanurah"        
#>  [901] "Ju'aymah"        "terminals"       "1.9"             "2.2"            
#>  [905] "liftings"        "customers"       "drop"            "rallied"        
#>  [909] "fourth"          "February"        "2.5"             "third"          
#>  [913] "figures"         "include"         "neutral"         "zone"           
#>  [917] "sales"           "floating"        "storage"         "generally"      
#>  [921] "considered"      "part"            "Opec"            "purposes"       
#>  [925] "4.133"           "restraint"       "scheme"          "approved"       
#>  [929] "averaging"       "fold"            "jump"            "appeared"       
#>  [933] "result"          "rushing"         "lift"            "entitlements"   
#>  [937] "Last"            "week's"          "show"            "continued"      
#>  [941] "four"            "ex"              "partners"        "Aramco"         
#>  [945] "Exxon"           "XON"             "Mobil"           "MOB"            
#>  [949] "TX"              "Chevron"         "CHV"             "signed"         
#>  [953] "long"            "term"            "buy"             "17.52"          
#>  [957] "test"            "products"        "traditionally"   "tapers"         
#>  [961] "off"             "fallen"          "refinery"        "throughput"     
#>  [965] "1.1"             "cuts"            "Yanbu"           "Jubail"         
#>  [969] "refineries"      "100,000"         "Bahrain's"       "remained"       
#>  [973] "200,000"         "Deputy"          "ministers"       "Bahrain"        
#>  [977] "discuss"         "coordination"    "news"            "WAM"            
#>  [981] "reported"        "discussing"      "implementation"  "Sunday's"       
#>  [985] "Doha"            "Cooperation"     "Council"         "GCC"            
#>  [989] "Four"            "UAE"             "Organiaation"    "Exporting"      
#>  [993] "Countries"       "face"            "stiff"           "buyer"          
#>  [997] "resistance"      "stabilize"       "newspaper"       "none"           
#> [1001] "Crude"           "sharply"         "traders"         "estimated"      
#> [1005] "port"            "Philadelphia"    "closed"          "Cypriot"        
#> [1009] "tanker"          "Seapride"        "II"              "ran"            
#> [1013] "aground"         "hitting"         "200"             "foot"           
#> [1017] "tower"           "supporting"      "power"           "lines"          
#> [1021] "across"          "river"           "Coast"           "Guard"          
#> [1025] "spokesman"       "spill"           "ship"            "lodged"         
#> [1029] "rocks"           "opposite"        "Hope"            "Creek"          
#> [1033] "nuclear"         "Jersey"          "hoped"           "refloat"        
#> [1037] "tide"            "After"           "delivering"      "Paulsboro"      
#> [1041] "steering"        "transmission"    "carrying"        "Delaware"       
#> [1045] "States"          "should"          "increase"        "strategic"      
#> [1049] "deal"            "impact"          "policy"          "raise"          
#> [1053] "750"             "500"             "overseas"        "embargo"        
#> [1057] "rise"            "Aspen"           "Institute"       "Humanistic"     
#> [1061] "Studies"         "private"         "called"          "research"       
#> [1065] "exploration"     "development"     "techniques"      "predicted"      
#> [1069] "years"           "20s"             "consumption"     "instead"        
#> [1073] "increasing"      "guard"           "against"         "mitigate"       
#> [1077] "risks"           "increased"       "cited"           "basic"          
#> [1081] "paths"           "fee"             "device"          "accept"         
#> [1085] "full"            "benefits"        "cheap"           "either"         
#> [1089] "option"          "drawbacks"       "Unocal"          "Corp's"         
#> [1093] "Union"           "eastern"         "region"          "26"             
#> [1097] "16.35"           "Union's"         "Mercantile"      "Exchange"       
#> [1101] "debut"           "procedure"       "complex"         "use"            
#> [1105] "futures"         "worldwide"       "On"              "NYMEX"          
#> [1109] "allow"           "position"        "initiate"        "exchange"       
#> [1113] "closes"          "transaction"     "subsequently"    "hedged"         
#> [1117] "according"       "change"          "transacted"      "Thomas"         
#> [1121] "McKiernan"       "chairman"        "Foreign"         "able"           
#> [1125] "hedge"           "trades"          "opens"           "negotiate"      
#> [1129] "differential"    "explained"       "expanded"        "program"        
#> [1133] "serve"           "does"            "close"           "Frank"          
#> [1137] "Capozza"         "secretary"       "Century"         "Resources"      
#> [1141] "rule"            "already"         "effect"          "platinum"       
#> [1145] "open"            "interest"        "liquidity"       "Currently"      
#> [1149] "least"           "trader"          "physical"        "EFP"            
#> [1153] "entering"        "Under"           "arrangement"     "neither"        
#> [1157] "party"           "parties"         "offset"          "When"           
#> [1161] "proposed"        "Rosemary"        "McFadden"        "Expansion"      
#> [1165] "provision"       "add"             "globalization"   "providing"      
#> [1169] "24"              "hour"            "Commodity"       "Futures"        
#> [1173] "Trading"         "Commission"      "CFTC"            "Argentine"      
#> [1177] "10.8"            "12.32"           "13.81"           "Yacimientos"    
#> [1181] "Petroliferos"    "Fiscales"        "natural"         "gas"            
#> [1185] "totalled"        "1.15"            "cubic"           "metrers"        
#> [1189] "3.6"             "1.11"            "metres"          "produced"