File Coverage

File:C4/Matcher.pm
Coverage:7.2%

linestmtbrancondsubtimecode
1package C4::Matcher;
2
3# Copyright (C) 2007 LibLime
4#
5# This file is part of Koha.
6#
7# Koha is free software; you can redistribute it and/or modify it under the
8# terms of the GNU General Public License as published by the Free Software
9# Foundation; either version 2 of the License, or (at your option) any later
10# version.
11#
12# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License along
17# with Koha; if not, write to the Free Software Foundation, Inc.,
18# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
20
2
2
2
550
4
43
use strict;
21
2
2
2
9
3
89
use warnings;
22
23
2
2
2
221
31
32
use C4::Context;
24
2
2
2
127
5733
115
use MARC::Record;
25
2
2
2
159
4
309
use C4::Search;
26
2
2
2
9
3
935
use C4::Biblio;
27
28
2
2
2
10
3
99
use vars qw($VERSION);
29
30BEGIN {
31        # set the version for version checking
32
2
8541
        $VERSION = 3.01;
33}
34
35 - 71
=head1 NAME

C4::Matcher - find MARC records matching another one

=head1 SYNOPSIS

  my @matchers = C4::Matcher::GetMatcherList();

  my $matcher = C4::Matcher->new($record_type);
  $matcher->threshold($threshold);
  $matcher->code($code);
  $matcher->description($description);

  $matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, '');
  $matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, '');
  $matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]);

  $matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, '');
  $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ], 
                               [ { tag => '245', subfields => 'a', norms => [] } ]);

  my @matches = $matcher->get_matches($marc_record, $max_matches);

  foreach $match (@matches) {

      # matches already sorted in order of
      # decreasing score
      print "record ID: $match->{'record_id'};
      print "score:     $match->{'score'};

  }

  my $matcher_description = $matcher->dump();

=head1 FUNCTIONS

=cut
72
73 - 84
=head2 GetMatcherList

  my @matchers = C4::Matcher::GetMatcherList();

Returns an array of hashrefs list all matchers
present in the database.  Each hashref includes:

 * matcher_id
 * code
 * description

=cut
85
86sub GetMatcherList {
87
0
    my $dbh = C4::Context->dbh;
88
89
0
    my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id");
90
0
    $sth->execute();
91
0
    my @results = ();
92
0
    while (my $row = $sth->fetchrow_hashref) {
93
0
        push @results, $row;
94    }
95
0
    return @results;
96}
97
98 - 109
=head1 METHODS

=head2 new

  my $matcher = C4::Matcher->new($record_type, $threshold);

Creates a new Matcher.  C<$record_type> indicates which search
database to use, e.g., 'biblio' or 'authority' and defaults to
'biblio', while C<$threshold> is the minimum score required for a match
and defaults to 1000.

=cut
110
111sub new {
112
0
    my $class = shift;
113
0
    my $self = {};
114
115
0
    $self->{'id'} = undef;
116
117
0
    if ($#_ > -1) {
118
0
        $self->{'record_type'} = shift;
119    } else {
120
0
        $self->{'record_type'} = 'biblio';
121    }
122
123
0
    if ($#_ > -1) {
124
0
        $self->{'threshold'} = shift;
125    } else {
126
0
        $self->{'threshold'} = 1000;
127    }
128
129
0
    $self->{'code'} = '';
130
0
    $self->{'description'} = '';
131
132
0
    $self->{'matchpoints'} = [];
133
0
    $self->{'required_checks'} = [];
134
135
0
    bless $self, $class;
136
0
    return $self;
137}
138
139 - 147
=head2 fetch

  my $matcher = C4::Matcher->fetch($id);

Creates a matcher object from the version stored
in the database.  If a matcher with the given
id does not exist, returns undef.

=cut
148
149sub fetch {
150
0
    my $class = shift;
151
0
    my $id = shift;
152
0
    my $dbh = C4::Context->dbh();
153
154
0
    my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?");
155
0
    $sth->execute($id);
156
0
    my $row = $sth->fetchrow_hashref;
157
0
    $sth->finish();
158
0
    return undef unless defined $row;
159
160
0
    my $self = {};
161
0
    $self->{'id'} = $row->{'matcher_id'};
162
0
    $self->{'record_type'} = $row->{'record_type'};
163
0
    $self->{'code'} = $row->{'code'};
164
0
    $self->{'description'} = $row->{'description'};
165
0
    $self->{'threshold'} = int($row->{'threshold'});
166
0
    bless $self, $class;
167
168    # matchpoints
169
0
    $self->{'matchpoints'} = [];
170
0
    $sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id");
171
0
    $sth->execute($self->{'id'});
172
0
    while (my $row = $sth->fetchrow_hashref) {
173
0
        my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'});
174
0
0
        push @{ $self->{'matchpoints'} }, $matchpoint;
175    }
176
177    # required checks
178
0
    $self->{'required_checks'} = [];
179
0
    $sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id");
180
0
    $sth->execute($self->{'id'});
181
0
    while (my $row = $sth->fetchrow_hashref) {
182
0
        my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'});
183
0
        my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'});
184
0
        my $matchcheck = {};
185
0
        $matchcheck->{'source_matchpoint'} = $source_matchpoint;
186
0
        $matchcheck->{'target_matchpoint'} = $target_matchpoint;
187
0
0
        push @{ $self->{'required_checks'} }, $matchcheck;
188    }
189
190
0
    return $self;
191}
192
193sub _fetch_matchpoint {
194
0
    my $self = shift;
195
0
    my $matchpoint_id = shift;
196
197
0
    my $dbh = C4::Context->dbh;
198
0
    my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?");
199
0
    $sth->execute($matchpoint_id);
200
0
    my $row = $sth->fetchrow_hashref;
201
0
    my $matchpoint = {};
202
0
    $matchpoint->{'index'} = $row->{'search_index'};
203
0
    $matchpoint->{'score'} = int($row->{'score'});
204
0
    $sth->finish();
205
206
0
    $matchpoint->{'components'} = [];
207
0
    $sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence");
208
0
    $sth->execute($matchpoint_id);
209
0
    while ($row = $sth->fetchrow_hashref) {
210
0
        my $component = {};
211
0
        $component->{'tag'} = $row->{'tag'};
212
0
0
        $component->{'subfields'} = { map { $_ => 1 } split(//, $row->{'subfields'}) };
213
0
        $component->{'offset'} = int($row->{'offset'});
214
0
        $component->{'length'} = int($row->{'length'});
215
0
        $component->{'norms'} = [];
216
0
        my $sth2 = $dbh->prepare_cached("SELECT *
217                                         FROM matchpoint_component_norms
218                                         WHERE matchpoint_component_id = ? ORDER BY sequence");
219
0
        $sth2->execute($row->{'matchpoint_component_id'});
220
0
        while (my $row2 = $sth2->fetchrow_hashref) {
221
0
0
            push @{ $component->{'norms'} }, $row2->{'norm_routine'};
222        }
223
0
0
        push @{ $matchpoint->{'components'} }, $component;
224    }
225
0
    return $matchpoint;
226}
227
228 - 238
=head2 store

  my $id = $matcher->store();

Stores matcher in database.  The return value is the ID 
of the marc_matchers row.  If the matcher was 
previously retrieved from the database via the fetch()
method, the DB representation of the matcher
is replaced.

=cut
239
240sub store {
241
0
    my $self = shift;
242
243
0
    if (defined $self->{'id'}) {
244        # update
245
0
        $self->_del_matcher_components();
246
0
        $self->_update_marc_matchers();
247    } else {
248        # create new
249
0
        $self->_new_marc_matchers();
250    }
251
0
    $self->_store_matcher_components();
252
0
    return $self->{'id'};
253}
254
255sub _del_matcher_components {
256
0
    my $self = shift;
257
258
0
    my $dbh = C4::Context->dbh();
259
0
    my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?");
260
0
    $sth->execute($self->{'id'});
261
0
    $sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?");
262
0
    $sth->execute($self->{'id'});
263    # foreign key delete cascades take care of deleting relevant rows
264    # from matcher_matchpoints, matchpoint_components, and
265    # matchpoint_component_norms
266}
267
268sub _update_marc_matchers {
269
0
    my $self = shift;
270
271
0
    my $dbh = C4::Context->dbh();
272
0
    my $sth = $dbh->prepare_cached("UPDATE marc_matchers
273                                    SET code = ?,
274                                        description = ?,
275                                        record_type = ?,
276                                        threshold = ?
277                                    WHERE matcher_id = ?");
278
0
    $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'});
279}
280
281sub _new_marc_matchers {
282
0
    my $self = shift;
283
284
0
    my $dbh = C4::Context->dbh();
285
0
    my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers
286                                    (code, description, record_type, threshold)
287                                    VALUES (?, ?, ?, ?)");
288
0
    $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'});
289
0
    $self->{'id'} = $dbh->{'mysql_insertid'};
290}
291
292sub _store_matcher_components {
293
0
    my $self = shift;
294
295
0
    my $dbh = C4::Context->dbh();
296
0
    my $sth;
297
0
    my $matcher_id = $self->{'id'};
298
0
0
    foreach my $matchpoint (@{ $self->{'matchpoints'}}) {
299
0
        my $matchpoint_id = $self->_store_matchpoint($matchpoint);
300
0
        $sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id)
301                                     VALUES (?, ?)");
302
0
        $sth->execute($matcher_id, $matchpoint_id);
303    }
304
0
0
    foreach my $matchcheck (@{ $self->{'required_checks'} }) {
305
0
        my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'});
306
0
        my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'});
307
0
        $sth = $dbh->prepare_cached("INSERT INTO matchchecks
308                                     (matcher_id, source_matchpoint_id, target_matchpoint_id)
309                                     VALUES (?, ?, ?)");
310
0
        $sth->execute($matcher_id, $source_matchpoint_id, $target_matchpoint_id);
311    }
312
313}
314
315sub _store_matchpoint {
316
0
    my $self = shift;
317
0
    my $matchpoint = shift;
318
319
0
    my $dbh = C4::Context->dbh();
320
0
    my $sth;
321
0
    my $matcher_id = $self->{'id'};
322
0
    $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score)
323                                 VALUES (?, ?, ?)");
324
0
    $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'});
325
0
    my $matchpoint_id = $dbh->{'mysql_insertid'};
326
0
    my $seqnum = 0;
327
0
0
    foreach my $component (@{ $matchpoint->{'components'} }) {
328
0
        $seqnum++;
329
0
        $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components
330                                     (matchpoint_id, sequence, tag, subfields, offset, length)
331                                     VALUES (?, ?, ?, ?, ?, ?)");
332
0
        $sth->bind_param(1, $matchpoint_id);
333
0
        $sth->bind_param(2, $seqnum);
334
0
        $sth->bind_param(3, $component->{'tag'});
335
0
0
        $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} });
336
0
        $sth->bind_param(5, $component->{'offset'});
337
0
        $sth->bind_param(6, $component->{'length'});
338
0
        $sth->execute();
339
0
        my $matchpoint_component_id = $dbh->{'mysql_insertid'};
340
0
        my $normseq = 0;
341
0
0
        foreach my $norm (@{ $component->{'norms'} }) {
342
0
            $normseq++;
343
0
            $sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms
344                                         (matchpoint_component_id, sequence, norm_routine)
345                                         VALUES (?, ?, ?)");
346
0
            $sth->execute($matchpoint_component_id, $normseq, $norm);
347        }
348    }
349
0
    return $matchpoint_id;
350}
351
352
353 - 360
=head2 delete

  C4::Matcher->delete($id);

Deletes the matcher of the specified ID
from the database.

=cut
361
362sub delete {
363
0
    my $class = shift;
364
0
    my $matcher_id = shift;
365
366
0
    my $dbh = C4::Context->dbh;
367
0
    my $sth = $dbh->prepare("DELETE FROM marc_matchers WHERE matcher_id = ?");
368
0
    $sth->execute($matcher_id); # relying on cascading deletes to clean up everything
369}
370
371 - 378
=head2 threshold

  $matcher->threshold(1000);
  my $threshold = $matcher->threshold();

Accessor method.

=cut
379
380sub threshold {
381
0
    my $self = shift;
382
0
    @_ ? $self->{'threshold'} = shift : $self->{'threshold'};
383}
384
385 - 394
=head2 _id

  $matcher->_id(123);
  my $id = $matcher->_id();

Accessor method.  Note that using this method
to set the DB ID of the matcher should not be
done outside of the editing CGI.

=cut
395
396sub _id {
397
0
    my $self = shift;
398
0
    @_ ? $self->{'id'} = shift : $self->{'id'};
399}
400
401 - 408
=head2 code

  $matcher->code('ISBN');
  my $code = $matcher->code();

Accessor method.

=cut
409
410sub code {
411
0
    my $self = shift;
412
0
    @_ ? $self->{'code'} = shift : $self->{'code'};
413}
414
415 - 422
=head2 description

  $matcher->description('match on ISBN');
  my $description = $matcher->description();

Accessor method.

=cut
423
424sub description {
425
0
    my $self = shift;
426
0
    @_ ? $self->{'description'} = shift : $self->{'description'};
427}
428
429 - 451
=head2 add_matchpoint

  $matcher->add_matchpoint($index, $score, $matchcomponents);

Adds a matchpoint that may include multiple components.  The $index
parameter identifies the index that will be searched, while $score
is the weight that will be added if a match is found.

$matchcomponents should be a reference to an array of matchpoint
compoents, each of which should be a hash containing the following 
keys:
    tag
    subfields
    offset
    length
    norms

The normalization_rules value should in turn be a reference to an
array, each element of which should be a reference to a 
normalization subroutine (under C4::Normalize) to be applied
to the source string.

=cut
452
453sub add_matchpoint {
454
0
    my $self = shift;
455
0
    my ($index, $score, $matchcomponents) = @_;
456
457
0
    my $matchpoint = {};
458
0
    $matchpoint->{'index'} = $index;
459
0
    $matchpoint->{'score'} = $score;
460
0
    $matchpoint->{'components'} = [];
461
0
0
    foreach my $input_component (@{ $matchcomponents }) {
462
0
0
        push @{ $matchpoint->{'components'} }, _parse_match_component($input_component);
463    }
464
0
0
    push @{ $self->{'matchpoints'} }, $matchpoint;
465}
466
467 - 478
=head2 add_simple_matchpoint

  $matcher->add_simple_matchpoint($index, $score, $source_tag,
                            $source_subfields, $source_offset, 
                            $source_length, $source_normalizer);


Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields,
normalized per the normalization fuction, search the index.  All records retrieved
will receive the assigned score.

=cut
479
480sub add_simple_matchpoint {
481
0
    my $self = shift;
482
0
    my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_;
483
484
0
    $self->add_matchpoint($index, $score, [
485                          { tag => $source_tag, subfields => $source_subfields,
486                            offset => $source_offset, 'length' => $source_length,
487                            norms => [ $source_normalizer ]
488                          }
489                         ]);
490}
491
492 - 522
=head2 add_required_check

  $match->add_required_check($source_matchpoint, $target_matchpoint);

Adds a required check definition.  A required check means that in 
order for a match to be considered valid, the key derived from the
source (incoming) record must match the key derived from the target
(already in DB) record.

Unlike a regular matchpoint, only the first repeat of each tag 
in the source and target match criteria are considered.

A typical example of a required check would be verifying that the
titles and publication dates match.

$source_matchpoint and $target_matchpoint are each a reference to
an array of hashes, where each hash follows the same definition
as the matchpoint component specification in add_matchpoint, i.e.,

    tag
    subfields
    offset
    length
    norms

The normalization_rules value should in turn be a reference to an
array, each element of which should be a reference to a 
normalization subroutine (under C4::Normalize) to be applied
to the source string.

=cut
523
524sub add_required_check {
525
0
    my $self = shift;
526
0
    my ($source_matchpoint, $target_matchpoint) = @_;
527
528
0
    my $matchcheck = {};
529
0
    $matchcheck->{'source_matchpoint'}->{'index'} = '';
530
0
    $matchcheck->{'source_matchpoint'}->{'score'} = 0;
531
0
    $matchcheck->{'source_matchpoint'}->{'components'} = [];
532
0
    $matchcheck->{'target_matchpoint'}->{'index'} = '';
533
0
    $matchcheck->{'target_matchpoint'}->{'score'} = 0;
534
0
    $matchcheck->{'target_matchpoint'}->{'components'} = [];
535
0
0
    foreach my $input_component (@{ $source_matchpoint }) {
536
0
0
        push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component);
537    }
538
0
0
    foreach my $input_component (@{ $target_matchpoint }) {
539
0
0
        push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component);
540    }
541
0
0
    push @{ $self->{'required_checks'} }, $matchcheck;
542}
543
544 - 554
=head2 add_simple_required_check

  $matcher->add_simple_required_check($source_tag, $source_subfields,
                $source_offset, $source_length, $source_normalizer, 
                $target_tag, $target_subfields, $target_offset, 
                $target_length, $target_normalizer);

Adds a required check, which requires that the normalized keys made from the source and targets
must match for a match to be considered valid.

=cut
555
556sub add_simple_required_check {
557
0
    my $self = shift;
558
0
    my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer,
559        $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_;
560
561
0
    $self->add_required_check(
562      [ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, 'length' => $source_length,
563          norms => [ $source_normalizer ] } ],
564      [ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, 'length' => $target_length,
565          norms => [ $target_normalizer ] } ]
566    );
567}
568
569 - 593
=head2 find_matches

  my @matches = $matcher->get_matches($marc_record, $max_matches);
  foreach $match (@matches) {
      # matches already sorted in order of
      # decreasing score
      print "record ID: $match->{'record_id'};
      print "score:     $match->{'score'};
  }

Identifies all of the records matching the given MARC record.  For a record already 
in the database to be considered a match, it must meet the following criteria:

=over 2

=item 1. Total score from its matching field must exceed the supplied threshold.

=item 2. It must pass all required checks.

=back

Only the top $max_matches matches are returned.  The returned array is sorted
in order of decreasing score, i.e., the best match is first.

=cut
594
595sub get_matches {
596
0
    my $self = shift;
597
0
    my ($source_record, $max_matches) = @_;
598
599
0
    my %matches = ();
600
601
0
0
    foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
602
0
        my @source_keys = _get_match_keys($source_record, $matchpoint);
603
0
        next if scalar(@source_keys) == 0;
604        # build query
605
0
0
        my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys);
606        # FIXME only searching biblio index at the moment
607
0
        my ($error, $searchresults, $total_hits) = SimpleSearch($query, 0, $max_matches);
608
609
0
        if (defined $error ) {
610
0
            warn "search failed ($query) $error";
611        } else {
612
0
0
            foreach my $matched (@{$searchresults}) {
613
0
                $matches{$matched} += $matchpoint->{'score'};
614            }
615        }
616    }
617
618    # get rid of any that don't meet the threshold
619
0
0
    %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;
620
621    # get rid of any that don't meet the required checks
622
0
0
    %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ? ($_ => $matches{$_}) : () }
623                keys %matches;
624
625
0
    my @results = ();
626
0
    foreach my $marcblob (keys %matches) {
627
0
        my $target_record = MARC::Record->new_from_usmarc($marcblob);
628
0
        my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, '');
629        # FIXME - again, bibliospecific
630        # also, can search engine be induced to give just the number in the first place?
631
0
        my $record_number = $result->{'biblionumber'};
632
0
        push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
633    }
634
0
0
    @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
635
0
    if (scalar(@results) > $max_matches) {
636
0
        @results = @results[0..$max_matches-1];
637    }
638
0
    return @results;
639
640}
641
642 - 650
=head2 dump

  $description = $matcher->dump();

Returns a reference to a structure containing all of the information
in the matcher object.  This is mainly a convenience method to
aid setting up a HTML editing form.

=cut
651
652sub dump {
653
0
    my $self = shift;
654
655
0
    my $result = {};
656
657
0
    $result->{'matcher_id'} = $self->{'id'};
658
0
    $result->{'code'} = $self->{'code'};
659
0
    $result->{'description'} = $self->{'description'};
660
661
0
    $result->{'matchpoints'} = [];
662
0
0
    foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
663
0
0
        push @{ $result->{'matchpoints'} }, $matchpoint;
664    }
665
0
    $result->{'matchchecks'} = [];
666
0
0
    foreach my $matchcheck (@{ $self->{'required_checks'} }) {
667
0
0
        push @{ $result->{'matchchecks'} }, $matchcheck;
668    }
669
670
0
    return $result;
671}
672
673sub _passes_required_checks {
674
0
    my ($source_record, $target_blob, $matchchecks) = @_;
675
0
    my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice
676
677    # no checks supplied == automatic pass
678
0
0
    return 1 if $#{ $matchchecks } == -1;
679
680
0
0
    foreach my $matchcheck (@{ $matchchecks }) {
681
0
        my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'});
682
0
        my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'});
683
0
        return 0 unless $source_key eq $target_key;
684    }
685
0
    return 1;
686}
687
688sub _get_match_keys {
689
0
    my $source_record = shift;
690
0
    my $matchpoint = shift;
691
0
    my $check_only_first_repeat = @_ ? shift : 0;
692
693    # If there is more than one component to the matchpoint (e.g.,
694    # matchpoint includes both 003 and 001), any repeats
695    # of the first component's tag are identified; repeats
696    # of the subsequent components' tags are appended to
697    # each parallel key dervied from the first component,
698    # up to the number of repeats of the first component's tag.
699    #
700    # For example, if the record has one 003 and two 001s, only
701    # one key is retrieved because there is only one 003. The key
702    # will consist of the contents of the first 003 and first 001.
703    #
704    # If there are two 003s and two 001s, there will be two keys:
705    # first 003 + first 001
706    # second 003 + second 001
707
708
0
    my @keys = ();
709
0
    for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) {
710
0
        my $component = $matchpoint->{'components'}->[$i];
711
0
        my $j = -1;
712
0
        FIELD: foreach my $field ($source_record->field($component->{'tag'})) {
713
0
            $j++;
714
0
            last FIELD if $j > 0 and $check_only_first_repeat;
715
0
            last FIELD if $i > 0 and $j > $#keys;
716
0
            my $key = "";
717
0
                        my $string;
718
0
            if ($field->is_control_field()) {
719
0
                                $string=$field->data();
720            } else {
721
0
                foreach my $subfield ($field->subfields()) {
722
0
                    if (exists $component->{'subfields'}->{$subfield->[0]}) {
723
0
                        $string .= " " . $subfield->[1];
724                    }
725                }
726                        }
727
0
            if ($component->{'length'}>0) {
728
0
                    $string= substr($string, $component->{'offset'}, $component->{'length'});
729                            # FIXME normalize, substr
730            } elsif ($component->{'offset'}) {
731
0
                    $string= substr($string, $component->{'offset'});
732            }
733
0
            $key = _normalize($string);
734
0
            if ($i == 0) {
735
0
                push @keys, $key if $key;
736            } else {
737
0
                $keys[$j] .= " $key" if $key;
738            }
739        }
740
0
    }
741
0
    return @keys;
742}
743
744
745sub _parse_match_component {
746
0
    my $input_component = shift;
747
748
0
    my $component = {};
749
0
    $component->{'tag'} = $input_component->{'tag'};
750
0
0
    $component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) };
751
0
    $component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1;
752
0
    $component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0;
753
0
    $component->{'norms'} = $input_component->{'norms'} ? $input_component->{'norms'} : [];
754
755
0
    return $component;
756}
757
758# FIXME - default normalizer
759sub _normalize {
760
0
    my $value = uc shift;
761
0
    $value =~ s/[.;:,\]\[\)\(\/'"]//g;
762
0
    $value =~ s/^\s+//;
763    #$value =~ s/^\s+$//;
764
0
    $value =~ s/\s+$//;
765
0
    $value =~ s/\s+/ /g;
766    #$value =~ s/[.;,\]\[\)\(\/"']//g;
767
0
    return $value;
768}
769
7701;