MCE::Grep(3pm)

1MCE::Grep(3)          User Contributed Perl Documentation         MCE::Grep(3)
2
3
4

NAME

6       MCE::Grep - Parallel grep model similar to the native grep function
7

VERSION

9       This document describes MCE::Grep version 1.874
10

SYNOPSIS

12        ## Exports mce_grep, mce_grep_f, and mce_grep_s
13        use MCE::Grep;
14
15        ## Array or array_ref
16        my @a = mce_grep { $_ % 5 == 0 } 1..10000;
17        my @b = mce_grep { $_ % 5 == 0 } \@list;
18
19        ## Important; pass an array_ref for deeply nested input data
20        my @c = mce_grep { $_->[1] % 2 == 0 } [ [ 0, 1 ], [ 0, 2 ], ... ];
21        my @d = mce_grep { $_->[1] % 2 == 0 } \@deeply_list;
22
23        ## File path, glob ref, IO::All::{ File, Pipe, STDIO } obj, or scalar ref
24        ## Workers read directly and do not involve the manager process
25        my @e = mce_grep_f { /pattern/ } "/path/to/file"; # efficient
26
27        ## Involves the manager process, therefore slower
28        my @f = mce_grep_f { /pattern/ } $file_handle;
29        my @g = mce_grep_f { /pattern/ } $io;
30        my @h = mce_grep_f { /pattern/ } \$scalar;
31
32        ## Sequence of numbers (begin, end [, step, format])
33        my @i = mce_grep_s { $_ % 3 == 0 } 1, 10000, 5;
34        my @j = mce_grep_s { $_ % 3 == 0 } [ 1, 10000, 5 ];
35
36        my @k = mce_grep_s { $_ % 3 == 0 } {
37           begin => 1, end => 10000, step => 5, format => undef
38        };
39

DESCRIPTION

41       This module provides a parallel grep implementation via Many-Core
42       Engine.  MCE incurs a small overhead due to passing of data. A fast
43       code block will run faster natively. However, the overhead will likely
44       diminish as the complexity increases for the code.
45
46        my @m1 =     grep { $_ % 5 == 0 } 1..1000000;          ## 0.065 secs
47        my @m2 = mce_grep { $_ % 5 == 0 } 1..1000000;          ## 0.194 secs
48
49       Chunking, enabled by default, greatly reduces the overhead behind the
50       scenes.  The time for mce_grep below also includes the time for data
51       exchanges between the manager and worker processes. More
52       parallelization will be seen when the code incurs additional CPU time.
53
54        my @m1 =     grep { /[2357][1468][9]/ } 1..1000000;    ## 0.353 secs
55        my @m2 = mce_grep { /[2357][1468][9]/ } 1..1000000;    ## 0.218 secs
56
57       Even faster is mce_grep_s; useful when input data is a range of
58       numbers.  Workers generate sequences mathematically among themselves
59       without any interaction from the manager process. Two arguments are
60       required for mce_grep_s (begin, end). Step defaults to 1 if begin is
61       smaller than end, otherwise -1.
62
63        my @m3 = mce_grep_s { /[2357][1468][9]/ } 1, 1000000;  ## 0.165 secs
64
65       Although this document is about MCE::Grep, the MCE::Stream module can
66       write results immediately without waiting for all chunks to complete.
67       This is made possible by passing the reference to an array (in this
68       case @m4 and @m5).
69
70        use MCE::Stream default_mode => 'grep';
71
72        my @m4; mce_stream \@m4, sub { /[2357][1468][9]/ }, 1..1000000;
73
74           ## Completed in 0.203 secs. This is amazing considering the
75           ## overhead for passing data between the manager and workers.
76
77        my @m5; mce_stream_s \@m5, sub { /[2357][1468][9]/ }, 1, 1000000;
78
79           ## Completed in 0.120 secs. Like with mce_grep_s, specifying a
80           ## sequence specification turns out to be faster due to less
81           ## overhead for the manager process.
82
83       A common scenario is grepping for pattern(s) inside a massive log file.
84       Notice how parallelism increases as complexity increases for the
85       pattern.  Testing was done against a 300 MB file containing 250k lines.
86
87        use MCE::Grep;
88
89        my @m; open my $LOG, "<", "/path/to/log/file" or die "$!\n";
90
91        @m = grep { /pattern/ } <$LOG>;                      ##  0.756 secs
92        @m = grep { /foobar|[2357][1468][9]/ } <$LOG>;       ## 24.681 secs
93
94        ## Parallelism with mce_grep. This involves the manager process
95        ## due to processing a file handle.
96
97        @m = mce_grep { /pattern/ } <$LOG>;                  ##  0.997 secs
98        @m = mce_grep { /foobar|[2357][1468][9]/ } <$LOG>;   ##  7.439 secs
99
100        ## Even faster with mce_grep_f. Workers access the file directly
101        ## with zero interaction from the manager process.
102
103        my $LOG = "/path/to/file";
104        @m = mce_grep_f { /pattern/ } $LOG;                  ##  0.112 secs
105        @m = mce_grep_f { /foobar|[2357][1468][9]/ } $LOG;   ##  6.840 secs
106

PARSING HUGE FILES

108       The MCE::Grep module does not know the pattern used inside the code
109       block, so it cannot quickly determine whether a match exists. Use
110       the following snippet as a template to achieve better performance.
111       Also, take a look at examples/egrep.pl, included with the distribution.
112
113        use MCE::Loop;
114
115        MCE::Loop->init(
116           max_workers => 8, use_slurpio => 1
117        );
118
119        my $pattern  = 'karl';
120        my $hugefile = 'very_huge.file';
121
122        my @result = mce_loop_f {
123           my ($mce, $slurp_ref, $chunk_id) = @_;
124
125           ## Quickly determine if a match is found.
126           ## Process slurped chunk only if true.
127
128           if ($$slurp_ref =~ /$pattern/m) {
129              my @matches;
130
131              ## The following is fast on Unix. Performance degrades
132              ## drastically on Windows beyond 4 workers.
133
134              open my $MEM_FH, '<', $slurp_ref;
135              binmode $MEM_FH, ':raw';
136              while (<$MEM_FH>) { push @matches, $_ if (/$pattern/); }
137              close   $MEM_FH;
138
139              ## Therefore, use the following construct on Windows.
140
141              while ( $$slurp_ref =~ /([^\n]+\n)/mg ) {
142                 my $line = $1; # save $1 to not lose the value
143                 push @matches, $line if ($line =~ /$pattern/);
144              }
145
146              ## Gather matched lines.
147
148              MCE->gather(@matches);
149           }
150
151        } $hugefile;
152
153        print join('', @result);
154

OVERRIDING DEFAULTS

156       The following lists the options that may be overridden when loading
157       the module.
158
159        use Sereal qw( encode_sereal decode_sereal );
160        use CBOR::XS qw( encode_cbor decode_cbor );
161        use JSON::XS qw( encode_json decode_json );
162
163        use MCE::Grep
164            max_workers => 4,                # Default 'auto'
165            chunk_size => 100,               # Default 'auto'
166            tmp_dir => "/path/to/app/tmp",   # $MCE::Signal::tmp_dir
167            freeze => \&encode_sereal,       # \&Storable::freeze
168            thaw => \&decode_sereal          # \&Storable::thaw
169        ;
170
171       From MCE 1.8 onwards, Sereal 3.015+ is loaded automatically if
172       available.  Specify "Sereal => 0" to use Storable instead.
173
174        use MCE::Grep Sereal => 0;
175

CUSTOMIZING MCE

177       MCE::Grep->init ( options )
178       MCE::Grep::init { options }
179
180       The init function accepts a hash of MCE options. The gather option, if
181       specified, is ignored due to being used internally by the module.
182
183        use MCE::Grep;
184
185        MCE::Grep->init(
186           chunk_size => 1, max_workers => 4,
187
188           user_begin => sub {
189              print "## ", MCE->wid, " started\n";
190           },
191
192           user_end => sub {
193              print "## ", MCE->wid, " completed\n";
194           }
195        );
196
197        my @a = mce_grep { $_ % 5 == 0 } 1..100;
198
199        print "\n", "@a", "\n";
200
201        -- Output
202
203        ## 2 started
204        ## 3 started
205        ## 1 started
206        ## 4 started
207        ## 3 completed
208        ## 4 completed
209        ## 1 completed
210        ## 2 completed
211
212        5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100
213

API DOCUMENTATION

215       MCE::Grep->run ( sub { code }, list )
216       mce_grep { code } list
217
218       Input data may be defined using a list or an array reference. Unlike
219       MCE::Loop, Flow, and Step, specifying a hash reference as input data
220       isn't allowed.
221
222        ## Array or array_ref
223        my @a = mce_grep { /[2357]/ } 1..1000;
224        my @b = mce_grep { /[2357]/ } \@list;
225
226        ## Important; pass an array_ref for deeply nested input data
227        my @c = mce_grep { $_->[1] =~ /[2357]/ } [ [ 0, 1 ], [ 0, 2 ], ... ];
228        my @d = mce_grep { $_->[1] =~ /[2357]/ } \@deeply_list;
229
230        ## Not supported
231        my @z = mce_grep { ... } \%hash;
232
233       MCE::Grep->run_file ( sub { code }, file )
234       mce_grep_f { code } file
235
236       The fastest of these is the /path/to/file. Workers communicate the next
237       offset position among themselves with zero interaction by the manager
238       process.
239
240       "IO::All" { File, Pipe, STDIO } is supported since MCE 1.845.
241
242        my @c = mce_grep_f { /pattern/ } "/path/to/file";  # faster
243        my @d = mce_grep_f { /pattern/ } $file_handle;
244        my @e = mce_grep_f { /pattern/ } $io;              # IO::All
245        my @f = mce_grep_f { /pattern/ } \$scalar;
246
247       MCE::Grep->run_seq ( sub { code }, $beg, $end [, $step, $fmt ] )
248       mce_grep_s { code } $beg, $end [, $step, $fmt ]
249
250       Sequence may be defined as a list, an array reference, or a hash
251       reference.  The functions require both begin and end values to run.
252       Step and format are optional. The format is passed to sprintf (% may be
253       omitted below).
254
255        my ($beg, $end, $step, $fmt) = (10, 20, 0.1, "%4.1f");
256
257        my @f = mce_grep_s { /[1234]\.[5678]/ } $beg, $end, $step, $fmt;
258        my @g = mce_grep_s { /[1234]\.[5678]/ } [ $beg, $end, $step, $fmt ];
259
260        my @h = mce_grep_s { /[1234]\.[5678]/ } {
261           begin => $beg, end => $end,
262           step => $step, format => $fmt
263        };
264
265       MCE::Grep->run ( sub { code }, iterator )
266       mce_grep { code } iterator
267
268       An iterator reference may be specified for input_data. Iterators are
269       described under section "SYNTAX for INPUT_DATA" at MCE::Core.
270
271        my @a = mce_grep { $_ % 3 == 0 } make_iterator(10, 30, 2);
272

MANUAL SHUTDOWN

274       MCE::Grep->finish
275       MCE::Grep::finish
276
277       Workers remain persistent as much as possible after running. Shutdown
278       occurs automatically when the script terminates. Call finish when
279       workers are no longer needed.
280
281        use MCE::Grep;
282
283        MCE::Grep->init(
284           chunk_size => 20, max_workers => 'auto'
285        );
286
287        my @a = mce_grep { ... } 1..100;
288
289        MCE::Grep->finish;
290

INDEX

292       MCE, MCE::Core
293

AUTHOR

295       Mario E. Roy, <marioeroy AT gmail DOT com>
296
297
298
299perl v5.34.0                      2021-07-22                      MCE::Grep(3)