MCE::Grep(3pm)

1MCE::Grep(3)          User Contributed Perl Documentation         MCE::Grep(3)
2
3
4

NAME

6       MCE::Grep - Parallel grep model similar to the native grep function
7

VERSION

9       This document describes MCE::Grep version 1.837
10

SYNOPSIS

12        ## Exports mce_grep, mce_grep_f, and mce_grep_s
13        use MCE::Grep;
14
15        ## Array or array_ref
16        my @a = mce_grep { $_ % 5 == 0 } 1..10000;
17        my @b = mce_grep { $_ % 5 == 0 } [ 1..10000 ];
18
19        ## File_path, glob_ref, or scalar_ref
20        my @c = mce_grep_f { /pattern/ } "/path/to/file";
21        my @d = mce_grep_f { /pattern/ } $file_handle;
22        my @e = mce_grep_f { /pattern/ } \$scalar;
23
24        ## Sequence of numbers (begin, end [, step, format])
25        my @f = mce_grep_s { $_ % 3 == 0 } 1, 10000, 5;
26        my @g = mce_grep_s { $_ % 3 == 0 } [ 1, 10000, 5 ];
27

28        my @h = mce_grep_s { $_ % 3 == 0 } {
29           begin => 1, end => 10000, step => 5, format => undef
30        };
31

DESCRIPTION

33       This module provides a parallel grep implementation via Many-Core
34       Engine.  MCE incurs a small overhead due to passing of data. A fast
35       code block will run faster natively. However, the overhead will likely
36       diminish as the complexity increases for the code.
37
38        my @m1 =     grep { $_ % 5 == 0 } 1..1000000;          ## 0.065 secs
39        my @m2 = mce_grep { $_ % 5 == 0 } 1..1000000;          ## 0.194 secs
40
41       Chunking, enabled by default, greatly reduces the overhead behind the
42       scene.  The time for mce_grep below also includes the time for data
43       exchanges between the manager and worker processes. More
44       parallelization will be seen when the code incurs additional CPU time.
45
46        my @m1 =     grep { /[2357][1468][9]/ } 1..1000000;    ## 0.353 secs
47        my @m2 = mce_grep { /[2357][1468][9]/ } 1..1000000;    ## 0.218 secs
48
49       Even faster is mce_grep_s; useful when input data is a range of
50       numbers.  Workers generate sequences mathematically among themselves
51       without any interaction from the manager process. Two arguments are
52       required for mce_grep_s (begin, end). Step defaults to 1 if begin is
53       smaller than end, otherwise -1.
54
55        my @m3 = mce_grep_s { /[2357][1468][9]/ } 1, 1000000;  ## 0.165 secs
56
57       Although this document is about MCE::Grep, the MCE::Stream module can
58       write results immediately without waiting for all chunks to complete.
59       This is made possible by passing the reference to an array (in this
60       case @m4 and @m5).
61
62        use MCE::Stream default_mode => 'grep';
63
64        my @m4; mce_stream \@m4, sub { /[2357][1468][9]/ }, 1..1000000;
65
66           ## Completed in 0.203 secs. This is amazing considering the
67           ## overhead for passing data between the manager and workers.
68
69        my @m5; mce_stream_s \@m5, sub { /[2357][1468][9]/ }, 1, 1000000;
70
71           ## Completed in 0.120 secs. Like with mce_grep_s, specifying a
72           ## sequence specification turns out to be faster due to less
73           ## overhead for the manager process.
74
75       A common scenario is grepping for pattern(s) inside a massive log file.
76       Notice how parallelism increases as complexity increases for the
77       pattern.  Testing was done against a 300 MB file containing 250k lines.
78
79        use MCE::Grep;
80
81        my @m; open my $LOG, "<", "/path/to/log/file" or die "$!\n";
82
83        @m = grep { /pattern/ } <$LOG>;                      ##  0.756 secs
84        @m = grep { /foobar|[2357][1468][9]/ } <$LOG>;       ## 24.681 secs
85
86        ## Parallelism with mce_grep. This involves the manager process
87        ## due to processing a file handle.
88
89        @m = mce_grep { /pattern/ } <$LOG>;                  ##  0.997 secs
90        @m = mce_grep { /foobar|[2357][1468][9]/ } <$LOG>;   ##  7.439 secs
91
92        ## Even faster with mce_grep_f. Workers access the file directly
93        ## with zero interaction from the manager process.
94
95        my $LOG = "/path/to/file";
96        @m = mce_grep_f { /pattern/ } $LOG;                  ##  0.112 secs
97        @m = mce_grep_f { /foobar|[2357][1468][9]/ } $LOG;   ##  6.840 secs
98

PARSING HUGE FILES

100       The MCE::Grep module cannot quickly determine whether a chunk contains
101       a match, because it does not know the pattern inside the code block.
102       Use the following snippet as a template to achieve better performance.
103       Also, take a look at examples/egrep.pl, included with the distribution.
104
105        use MCE::Loop;
106
107        MCE::Loop::init {
108           max_workers => 8, use_slurpio => 1
109        };
110
111        my $pattern  = 'karl';
112        my $hugefile = 'very_huge.file';
113
114        my @result = mce_loop_f {
115           my ($mce, $slurp_ref, $chunk_id) = @_;
116
117           ## Quickly determine if a match is found.
118           ## Process slurped chunk only if true.
119
120           if ($$slurp_ref =~ /$pattern/m) {
121              my @matches;
122
123              ## The following is fast on Unix. Performance degrades
124              ## drastically on Windows beyond 4 workers.
125
126              open my $MEM_FH, '<', $slurp_ref;
127              binmode $MEM_FH, ':raw';
128              while (<$MEM_FH>) { push @matches, $_ if (/$pattern/); }
129              close   $MEM_FH;
130
131              ## Therefore, use the following construct on Windows.
132
133              while ( $$slurp_ref =~ /([^\n]+\n)/mg ) {
134                 my $line = $1; # save $1 to not lose the value
135                 push @matches, $line if ($line =~ /$pattern/);
136              }
137
138              ## Gather matched lines.
139
140              MCE->gather(@matches);
141           }
142
143        } $hugefile;
144
145        print join('', @result);
146

OVERRIDING DEFAULTS

148       The following lists the options which may be overridden when loading
149       the module.
150
151        use Sereal qw( encode_sereal decode_sereal );
152        use CBOR::XS qw( encode_cbor decode_cbor );
153        use JSON::XS qw( encode_json decode_json );
154
155        use MCE::Grep
156            max_workers => 4,                # Default 'auto'
157            chunk_size => 100,               # Default 'auto'
158            tmp_dir => "/path/to/app/tmp",   # $MCE::Signal::tmp_dir
159            freeze => \&encode_sereal,       # \&Storable::freeze
160            thaw => \&decode_sereal          # \&Storable::thaw
161        ;
162
163       From MCE 1.8 onwards, Sereal 3.015+ is loaded automatically if
164       available.  Specify "Sereal => 0" to use Storable instead.
165
166        use MCE::Grep Sereal => 0;
167

CUSTOMIZING MCE

169       MCE::Grep->init ( options )
170       MCE::Grep::init { options }
171          The init function accepts a hash of MCE options. The gather option,
172          if specified, is ignored due to being used internally by the module.
173
174           use MCE::Grep;
175
176           MCE::Grep::init {
177              chunk_size => 1, max_workers => 4,
178
179              user_begin => sub {
180                 print "## ", MCE->wid, " started\n";
181              },
182
183              user_end => sub {
184                 print "## ", MCE->wid, " completed\n";
185              }
186           };
187
188           my @a = mce_grep { $_ % 5 == 0 } 1..100;
189
190           print "\n", "@a", "\n";
191
192           -- Output
193
194           ## 2 started
195           ## 3 started
196           ## 1 started
197           ## 4 started
198           ## 3 completed
199           ## 4 completed
200           ## 1 completed
201           ## 2 completed
202
203           5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100
204

API DOCUMENTATION

206       MCE::Grep->run ( sub { code }, list )
207       mce_grep { code } list
208          Input data may be defined using a list or an array reference. Unlike
209          MCE::Loop, Flow, and Step, specifying a hash reference as input data
210          isn't allowed.
211
212           my @a = mce_grep { /[2357]/ } 1..1000;
213           my @b = mce_grep { /[2357]/ } \@list;
214
215           my @z = mce_grep { /[2357]/ } \%hash;  # not supported
216
217       MCE::Grep->run_file ( sub { code }, file )
218       mce_grep_f { code } file
219          The fastest of these is the /path/to/file. Workers communicate the
220          next offset position among themselves with zero interaction by the
221          manager process.
222
223           my @c = mce_grep_f { /pattern/ } "/path/to/file";  # faster
224           my @d = mce_grep_f { /pattern/ } $file_handle;
225           my @e = mce_grep_f { /pattern/ } \$scalar;
226
227       MCE::Grep->run_seq ( sub { code }, $beg, $end [, $step, $fmt ] )
228       mce_grep_s { code } $beg, $end [, $step, $fmt ]
229          Sequence may be defined as a list, an array reference, or a hash
230          reference.  The functions require both begin and end values to run.
231          Step and format are optional. The format is passed to sprintf (% may
232          be omitted below).
233
234           my ($beg, $end, $step, $fmt) = (10, 20, 0.1, "%4.1f");
235
236           my @f = mce_grep_s { /[1234]\.[5678]/ } $beg, $end, $step, $fmt;
237           my @g = mce_grep_s { /[1234]\.[5678]/ } [ $beg, $end, $step, $fmt ];
238
239           my @h = mce_grep_s { /[1234]\.[5678]/ } {
240              begin => $beg, end => $end,
241              step => $step, format => $fmt
242           };
243
244       MCE::Grep->run ( sub { code }, iterator )
245       mce_grep { code } iterator
246          An iterator reference may be specified for input_data. Iterators are
247          described under section "SYNTAX for INPUT_DATA" at MCE::Core.
248
249           my @a = mce_grep { $_ % 3 == 0 } make_iterator(10, 30, 2);
250

MANUAL SHUTDOWN

252       MCE::Grep->finish
253       MCE::Grep::finish
254          Workers remain persistent as much as possible after running.
255          Shutdown occurs automatically when the script terminates. Call
256          finish when workers are no longer needed.
257
258           use MCE::Grep;
259
260           MCE::Grep::init {
261              chunk_size => 20, max_workers => 'auto'
262           };
263
264           my @a = mce_grep { ... } 1..100;
265
266           MCE::Grep::finish;
267

INDEX

269       MCE, MCE::Core
270

AUTHOR

272       Mario E. Roy, <marioeroy AT gmail DOT com>
273
274
275
276perl v5.28.0                      2018-08-25                      MCE::Grep(3)