MCE::Grep(3pm)

1MCE::Grep(3)          User Contributed Perl Documentation         MCE::Grep(3)
2
3
4

NAME

6       MCE::Grep - Parallel grep model similar to the native grep function
7

VERSION

9       This document describes MCE::Grep version 1.889
10

SYNOPSIS

12        ## Exports mce_grep, mce_grep_f, and mce_grep_s
13        use MCE::Grep;
14
15        ## Array or array_ref
16        my @a = mce_grep { $_ % 5 == 0 } 1..10000;
17        my @b = mce_grep { $_ % 5 == 0 } \@list;
18
19        ## Important; pass an array_ref for deeply nested input data
20        my @c = mce_grep { $_->[1] % 2 == 0 } [ [ 0, 1 ], [ 0, 2 ], ... ];
21        my @d = mce_grep { $_->[1] % 2 == 0 } \@deeply_list;
22
23        ## File path, glob ref, IO::All::{ File, Pipe, STDIO } obj, or scalar ref
24        ## Workers read the file directly and do not involve the manager process
25        my @e = mce_grep_f { /pattern/ } "/path/to/file"; # efficient
26
27        ## Involves the manager process, therefore slower
28        my @f = mce_grep_f { /pattern/ } $file_handle;
29        my @g = mce_grep_f { /pattern/ } $io;
30        my @h = mce_grep_f { /pattern/ } \$scalar;
31
32        ## Sequence of numbers (begin, end [, step, format])
33        my @i = mce_grep_s { $_ % 3 == 0 } 1, 10000, 5;
34        my @j = mce_grep_s { $_ % 3 == 0 } [ 1, 10000, 5 ];
35
36        my @k = mce_grep_s { $_ % 3 == 0 } {
37           begin => 1, end => 10000, step => 5, format => undef
38        };
39

DESCRIPTION

41       This module provides a parallel grep implementation via Many-Core
42       Engine.  MCE incurs a small overhead due to passing of data. A fast
43       code block will run faster natively. However, the overhead will likely
44       diminish as the complexity increases for the code.
45
46        my @m1 =     grep { $_ % 5 == 0 } 1..1000000;          ## 0.065 secs
47        my @m2 = mce_grep { $_ % 5 == 0 } 1..1000000;          ## 0.194 secs
48
49       Chunking, enabled by default, greatly reduces the overhead behind the
50       scenes.  The time for mce_grep below also includes the time for data
51       exchanges between the manager and worker processes. More
52       parallelization will be seen when the code incurs additional CPU time.
53
54        my @m1 =     grep { /[2357][1468][9]/ } 1..1000000;    ## 0.353 secs
55        my @m2 = mce_grep { /[2357][1468][9]/ } 1..1000000;    ## 0.218 secs
56
57       Even faster is mce_grep_s; useful when input data is a range of
58       numbers.  Workers generate sequences mathematically among themselves
59       without any interaction from the manager process. Two arguments are
60       required for mce_grep_s (begin, end). Step defaults to 1 if begin is
61       smaller than end, otherwise -1.
62
63        my @m3 = mce_grep_s { /[2357][1468][9]/ } 1, 1000000;  ## 0.165 secs
64
65       Although this document is about MCE::Grep, the MCE::Stream module can
66       write results immediately without waiting for all chunks to complete.
67       This is made possible by passing the reference to an array (in this
68       case @m4 and @m5).
69
70        use MCE::Stream default_mode => 'grep';
71
72        my @m4; mce_stream \@m4, sub { /[2357][1468][9]/ }, 1..1000000;
73
74           ## Completed in 0.203 secs. This is amazing considering the
75           ## overhead for passing data between the manager and workers.
76
77        my @m5; mce_stream_s \@m5, sub { /[2357][1468][9]/ }, 1, 1000000;
78
79           ## Completed in 0.120 secs. Like with mce_grep_s, specifying a
80           ## sequence specification turns out to be faster due to less
81           ## overhead for the manager process.
82
83       A common scenario is grepping for pattern(s) inside a massive log file.
84       Notice how parallelism increases as complexity increases for the
85       pattern.  Testing was done against a 300 MB file containing 250k lines.
86
87        use MCE::Grep;
88
89        my @m; open my $LOG, "<", "/path/to/log/file" or die "$!\n";
90
91        @m = grep { /pattern/ } <$LOG>;                      ##  0.756 secs
92        @m = grep { /foobar|[2357][1468][9]/ } <$LOG>;       ## 24.681 secs
93
94        ## Parallelism with mce_grep. This involves the manager process
95        ## due to processing a file handle.
96
97        @m = mce_grep { /pattern/ } <$LOG>;                  ##  0.997 secs
98        @m = mce_grep { /foobar|[2357][1468][9]/ } <$LOG>;   ##  7.439 secs
99
100        ## Even faster with mce_grep_f. Workers access the file directly
101        ## with zero interaction from the manager process.
102
103        my $LOG = "/path/to/file";
104        @m = mce_grep_f { /pattern/ } $LOG;                  ##  0.112 secs
105        @m = mce_grep_f { /foobar|[2357][1468][9]/ } $LOG;   ##  6.840 secs
106

PARSING HUGE FILES

108       The MCE::Grep module cannot quickly determine whether a chunk contains
109       a match, because it does not know the pattern inside the code block. Use
110       the following snippet as a template to achieve better performance.
111       Also, take a look at examples/egrep.pl, included with the distribution.
112
113        use MCE::Loop;
114
115        MCE::Loop->init(
116           max_workers => 8, use_slurpio => 1
117        );
118
119        my $pattern  = 'karl';
120        my $hugefile = 'very_huge.file';
121
122        my @result = mce_loop_f {
123           my ($mce, $slurp_ref, $chunk_id) = @_;
124
125           ## Quickly determine if a match is found.
126           ## Process slurped chunk only if true.
127
128           if ($$slurp_ref =~ /$pattern/m) {
129              my @matches;
130
131              ## The following is fast on Unix. Performance degrades
132              ## drastically on Windows beyond 4 workers.
133
134              open my $MEM_FH, '<', $slurp_ref;
135              binmode $MEM_FH, ':raw';
136              while (<$MEM_FH>) { push @matches, $_ if (/$pattern/); }
137              close   $MEM_FH;
138
139              ## Therefore, use the following construct on Windows.
140
141              while ( $$slurp_ref =~ /([^\n]+\n)/mg ) {
142                 my $line = $1; # save $1 to not lose the value
143                 push @matches, $line if ($line =~ /$pattern/);
144              }
145
146              ## Gather matched lines.
147
148              MCE->gather(@matches);
149           }
150
151        } $hugefile;
152
153        print join('', @result);
154

OVERRIDING DEFAULTS

156       The following lists the options that may be overridden when loading
157       the module.
158
159        use Sereal qw( encode_sereal decode_sereal );
160        use CBOR::XS qw( encode_cbor decode_cbor );
161        use JSON::XS qw( encode_json decode_json );
162
163        use MCE::Grep
164            max_workers => 4,                # Default 'auto'
165            chunk_size => 100,               # Default 'auto'
166            tmp_dir => "/path/to/app/tmp",   # $MCE::Signal::tmp_dir
167            freeze => \&encode_sereal,       # \&Storable::freeze
168            thaw => \&decode_sereal,         # \&Storable::thaw
169            init_relay => 0,                 # Default undef; MCE 1.882+
170            use_threads => 0,                # Default undef; MCE 1.882+
171        ;
172
173       From MCE 1.8 onwards, Sereal 3.015+ is loaded automatically if
174       available.  Specify "Sereal => 0" to use Storable instead.
175
176        use MCE::Grep Sereal => 0;
177

CUSTOMIZING MCE

179       MCE::Grep->init ( options )
180       MCE::Grep::init { options }
181
182       The init function accepts a hash of MCE options. The gather option, if
183       specified, is ignored due to being used internally by the module.
184
185        use MCE::Grep;
186
187        MCE::Grep->init(
188           chunk_size => 1, max_workers => 4,
189
190           user_begin => sub {
191              print "## ", MCE->wid, " started\n";
192           },
193
194           user_end => sub {
195              print "## ", MCE->wid, " completed\n";
196           }
197        );
198
199        my @a = mce_grep { $_ % 5 == 0 } 1..100;
200
201        print "\n", "@a", "\n";
202
203        -- Output
204
205        ## 2 started
206        ## 3 started
207        ## 1 started
208        ## 4 started
209        ## 3 completed
210        ## 4 completed
211        ## 1 completed
212        ## 2 completed
213
214        5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100
215

API DOCUMENTATION

217       MCE::Grep->run ( sub { code }, list )
218       mce_grep { code } list
219
220       Input data may be defined using a list or an array reference. Unlike
221       MCE::Loop, Flow, and Step, specifying a hash reference as input data
222       isn't allowed.
223
224        ## Array or array_ref
225        my @a = mce_grep { /[2357]/ } 1..1000;
226        my @b = mce_grep { /[2357]/ } \@list;
227
228        ## Important; pass an array_ref for deeply nested input data
229        my @c = mce_grep { $_->[1] =~ /[2357]/ } [ [ 0, 1 ], [ 0, 2 ], ... ];
230        my @d = mce_grep { $_->[1] =~ /[2357]/ } \@deeply_list;
231
232        ## Not supported
233        my @z = mce_grep { ... } \%hash;
234
235       MCE::Grep->run_file ( sub { code }, file )
236       mce_grep_f { code } file
237
238       The fastest of these is the /path/to/file. Workers communicate the next
239       offset position among themselves with zero interaction by the manager
240       process.
241
242       "IO::All" { File, Pipe, STDIO } is supported since MCE 1.845.
243
244        my @c = mce_grep_f { /pattern/ } "/path/to/file";  # faster
245        my @d = mce_grep_f { /pattern/ } $file_handle;
246        my @e = mce_grep_f { /pattern/ } $io;              # IO::All
247        my @f = mce_grep_f { /pattern/ } \$scalar;
248
249       MCE::Grep->run_seq ( sub { code }, $beg, $end [, $step, $fmt ] )
250       mce_grep_s { code } $beg, $end [, $step, $fmt ]
251
252       Sequence may be defined as a list, an array reference, or a hash
253       reference.  The functions require both begin and end values to run.
254       Step and format are optional. The format is passed to sprintf (% may be
255       omitted below).
256
257        my ($beg, $end, $step, $fmt) = (10, 20, 0.1, "%4.1f");
258
259        my @f = mce_grep_s { /[1234]\.[5678]/ } $beg, $end, $step, $fmt;
260        my @g = mce_grep_s { /[1234]\.[5678]/ } [ $beg, $end, $step, $fmt ];
261
262        my @h = mce_grep_s { /[1234]\.[5678]/ } {
263           begin => $beg, end => $end,
264           step => $step, format => $fmt
265        };
266
267       MCE::Grep->run ( sub { code }, iterator )
268       mce_grep { code } iterator
269
270       An iterator reference may be specified for input_data. Iterators are
271       described under section "SYNTAX for INPUT_DATA" at MCE::Core.
272
273        my @a = mce_grep { $_ % 3 == 0 } make_iterator(10, 30, 2);
274

MANUAL SHUTDOWN

276       MCE::Grep->finish
277       MCE::Grep::finish
278
279       Workers remain persistent as much as possible after running. Shutdown
280       occurs automatically when the script terminates. Call finish when
281       workers are no longer needed.
282
283        use MCE::Grep;
284
285        MCE::Grep->init(
286           chunk_size => 20, max_workers => 'auto'
287        );
288
289        my @a = mce_grep { ... } 1..100;
290
291        MCE::Grep->finish;
292

INDEX

294       MCE, MCE::Core
295

AUTHOR

297       Mario E. Roy, <marioeroy AT gmail DOT com>
298
299
300
301perl v5.38.0                      2023-09-14                      MCE::Grep(3)