1KinoSearch1::Analysis::Tokenizer(3)  User Contributed Perl Documentation  KinoSearch1::Analysis::Tokenizer(3)
2
3
4

NAME

6       KinoSearch1::Analysis::Tokenizer - customizable tokenizing
7

SYNOPSIS

9           my $whitespace_tokenizer
10               = KinoSearch1::Analysis::Tokenizer->new( token_re => qr/\S+/, );
11
12           # or...
13           my $word_char_tokenizer
14               = KinoSearch1::Analysis::Tokenizer->new( token_re => qr/\w+/, );
15
16           # or...
17           my $apostrophising_tokenizer = KinoSearch1::Analysis::Tokenizer->new;
18
19           # then... once you have a tokenizer, put it into a PolyAnalyzer
20           my $polyanalyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
21               analyzers => [ $lc_normalizer, $word_char_tokenizer, $stemmer ], );
22

DESCRIPTION

24       Generically, "tokenizing" is a process of breaking up a string into an
25       array of "tokens".
26
27           # before:
28           my $string = "three blind mice";
29
30           # after:
31           @tokens = qw( three blind mice );
32
33       KinoSearch1::Analysis::Tokenizer decides where it should break up the
34       text based on the value of "token_re".
35
36           # before:
37           my $string = "Eats, Shoots and Leaves.";
38
39           # tokenized by $whitespace_tokenizer
40           @tokens = qw( Eats, Shoots and Leaves. );
41
42           # tokenized by $word_char_tokenizer
43           @tokens = qw( Eats Shoots and Leaves   );
44

METHODS

46   new
47           # match "O'Henry" as well as "Henry" and "it's" as well as "it"
48           my $token_re = qr/
49                   \b        # start with a word boundary
50                   \w+       # Match word chars.
51                   (?:       # Group, but don't capture...
52                      '\w+   # ... an apostrophe plus word chars.
53                   )?        # Matching the apostrophe group is optional.
54                   \b        # end with a word boundary
55               /xsm;
56           my $tokenizer = KinoSearch1::Analysis::Tokenizer->new(
57               token_re => $token_re, # default: what you see above
58           );
59
60       Constructor.  Takes one hash style parameter.
61
62       •   token_re - must be a pre-compiled regular expression matching one
63           token.
64

COPYRIGHT

66       Copyright 2005-2010 Marvin Humphrey
67

LICENSE, DISCLAIMER, BUGS, etc.

69       See KinoSearch1 version 1.01.
70
71
72
73perl v5.32.1                      2021-01-27  KinoSearch1::Analysis::Tokenizer(3)
Impressum