| File: | C4/Matcher.pm |
| Coverage: | 7.2% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package C4::Matcher; | |||||
| 2 | ||||||
| 3 | # Copyright (C) 2007 LibLime | |||||
| 4 | # | |||||
| 5 | # This file is part of Koha. | |||||
| 6 | # | |||||
| 7 | # Koha is free software; you can redistribute it and/or modify it under the | |||||
| 8 | # terms of the GNU General Public License as published by the Free Software | |||||
| 9 | # Foundation; either version 2 of the License, or (at your option) any later | |||||
| 10 | # version. | |||||
| 11 | # | |||||
| 12 | # Koha is distributed in the hope that it will be useful, but WITHOUT ANY | |||||
| 13 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR | |||||
| 14 | # A PARTICULAR PURPOSE. See the GNU General Public License for more details. | |||||
| 15 | # | |||||
| 16 | # You should have received a copy of the GNU General Public License along | |||||
| 17 | # with Koha; if not, write to the Free Software Foundation, Inc., | |||||
| 18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |||||
| 19 | ||||||
| 20 | 4 4 4 | 45581 34 92 | use strict; | |||
| 21 | 4 4 4 | 18 46 122 | use warnings; | |||
| 22 | ||||||
| 23 | 4 4 4 | 367 38 75 | use C4::Context; | |||
| 24 | 4 4 4 | 474 20753 287 | use MARC::Record; | |||
| 25 | 4 4 4 | 463 44 771 | use C4::Search; | |||
| 26 | 4 4 4 | 49 31 2037 | use C4::Biblio; | |||
| 27 | ||||||
| 28 | 4 4 4 | 33 21 145 | use vars qw($VERSION); | |||
| 29 | ||||||
| 30 | BEGIN { | |||||
| 31 | # set the version for version checking | |||||
| 32 | 4 | 16543 | $VERSION = 3.01; | |||
| 33 | } | |||||
| 34 | ||||||
| 35 - 71 | =head1 NAME
C4::Matcher - find MARC records matching another one
=head1 SYNOPSIS
my @matchers = C4::Matcher::GetMatcherList();
my $matcher = C4::Matcher->new($record_type);
$matcher->threshold($threshold);
$matcher->code($code);
$matcher->description($description);
$matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, '');
$matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, '');
$matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]);
$matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, '');
$matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ],
[ { tag => '245', subfields => 'a', norms => [] } ]);
my @matches = $matcher->get_matches($marc_record, $max_matches);
foreach $match (@matches) {
# matches already sorted in order of
# decreasing score
print "record ID: $match->{'record_id'};
print "score: $match->{'score'};
}
my $matcher_description = $matcher->dump();
=head1 FUNCTIONS
=cut | |||||
| 72 | ||||||
| 73 - 84 | =head2 GetMatcherList my @matchers = C4::Matcher::GetMatcherList(); Returns an array of hashrefs list all matchers present in the database. Each hashref includes: * matcher_id * code * description =cut | |||||
| 85 | ||||||
| 86 | sub GetMatcherList { | |||||
| 87 | 0 | my $dbh = C4::Context->dbh; | ||||
| 88 | ||||||
| 89 | 0 | my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id"); | ||||
| 90 | 0 | $sth->execute(); | ||||
| 91 | 0 | my @results = (); | ||||
| 92 | 0 | while (my $row = $sth->fetchrow_hashref) { | ||||
| 93 | 0 | push @results, $row; | ||||
| 94 | } | |||||
| 95 | 0 | return @results; | ||||
| 96 | } | |||||
| 97 | ||||||
| 98 - 109 | =head1 METHODS =head2 new my $matcher = C4::Matcher->new($record_type, $threshold); Creates a new Matcher. C<$record_type> indicates which search database to use, e.g., 'biblio' or 'authority' and defaults to 'biblio', while C<$threshold> is the minimum score required for a match and defaults to 1000. =cut | |||||
| 110 | ||||||
| 111 | sub new { | |||||
| 112 | 0 | my $class = shift; | ||||
| 113 | 0 | my $self = {}; | ||||
| 114 | ||||||
| 115 | 0 | $self->{'id'} = undef; | ||||
| 116 | ||||||
| 117 | 0 | if ($#_ > -1) { | ||||
| 118 | 0 | $self->{'record_type'} = shift; | ||||
| 119 | } else { | |||||
| 120 | 0 | $self->{'record_type'} = 'biblio'; | ||||
| 121 | } | |||||
| 122 | ||||||
| 123 | 0 | if ($#_ > -1) { | ||||
| 124 | 0 | $self->{'threshold'} = shift; | ||||
| 125 | } else { | |||||
| 126 | 0 | $self->{'threshold'} = 1000; | ||||
| 127 | } | |||||
| 128 | ||||||
| 129 | 0 | $self->{'code'} = ''; | ||||
| 130 | 0 | $self->{'description'} = ''; | ||||
| 131 | ||||||
| 132 | 0 | $self->{'matchpoints'} = []; | ||||
| 133 | 0 | $self->{'required_checks'} = []; | ||||
| 134 | ||||||
| 135 | 0 | bless $self, $class; | ||||
| 136 | 0 | return $self; | ||||
| 137 | } | |||||
| 138 | ||||||
| 139 - 147 | =head2 fetch my $matcher = C4::Matcher->fetch($id); Creates a matcher object from the version stored in the database. If a matcher with the given id does not exist, returns undef. =cut | |||||
| 148 | ||||||
| 149 | sub fetch { | |||||
| 150 | 0 | my $class = shift; | ||||
| 151 | 0 | my $id = shift; | ||||
| 152 | 0 | my $dbh = C4::Context->dbh(); | ||||
| 153 | ||||||
| 154 | 0 | my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?"); | ||||
| 155 | 0 | $sth->execute($id); | ||||
| 156 | 0 | my $row = $sth->fetchrow_hashref; | ||||
| 157 | 0 | $sth->finish(); | ||||
| 158 | 0 | return undef unless defined $row; | ||||
| 159 | ||||||
| 160 | 0 | my $self = {}; | ||||
| 161 | 0 | $self->{'id'} = $row->{'matcher_id'}; | ||||
| 162 | 0 | $self->{'record_type'} = $row->{'record_type'}; | ||||
| 163 | 0 | $self->{'code'} = $row->{'code'}; | ||||
| 164 | 0 | $self->{'description'} = $row->{'description'}; | ||||
| 165 | 0 | $self->{'threshold'} = int($row->{'threshold'}); | ||||
| 166 | 0 | bless $self, $class; | ||||
| 167 | ||||||
| 168 | # matchpoints | |||||
| 169 | 0 | $self->{'matchpoints'} = []; | ||||
| 170 | 0 | $sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id"); | ||||
| 171 | 0 | $sth->execute($self->{'id'}); | ||||
| 172 | 0 | while (my $row = $sth->fetchrow_hashref) { | ||||
| 173 | 0 | my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'}); | ||||
| 174 | 0 0 | push @{ $self->{'matchpoints'} }, $matchpoint; | ||||
| 175 | } | |||||
| 176 | ||||||
| 177 | # required checks | |||||
| 178 | 0 | $self->{'required_checks'} = []; | ||||
| 179 | 0 | $sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id"); | ||||
| 180 | 0 | $sth->execute($self->{'id'}); | ||||
| 181 | 0 | while (my $row = $sth->fetchrow_hashref) { | ||||
| 182 | 0 | my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'}); | ||||
| 183 | 0 | my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'}); | ||||
| 184 | 0 | my $matchcheck = {}; | ||||
| 185 | 0 | $matchcheck->{'source_matchpoint'} = $source_matchpoint; | ||||
| 186 | 0 | $matchcheck->{'target_matchpoint'} = $target_matchpoint; | ||||
| 187 | 0 0 | push @{ $self->{'required_checks'} }, $matchcheck; | ||||
| 188 | } | |||||
| 189 | ||||||
| 190 | 0 | return $self; | ||||
| 191 | } | |||||
| 192 | ||||||
| 193 | sub _fetch_matchpoint { | |||||
| 194 | 0 | my $self = shift; | ||||
| 195 | 0 | my $matchpoint_id = shift; | ||||
| 196 | ||||||
| 197 | 0 | my $dbh = C4::Context->dbh; | ||||
| 198 | 0 | my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?"); | ||||
| 199 | 0 | $sth->execute($matchpoint_id); | ||||
| 200 | 0 | my $row = $sth->fetchrow_hashref; | ||||
| 201 | 0 | my $matchpoint = {}; | ||||
| 202 | 0 | $matchpoint->{'index'} = $row->{'search_index'}; | ||||
| 203 | 0 | $matchpoint->{'score'} = int($row->{'score'}); | ||||
| 204 | 0 | $sth->finish(); | ||||
| 205 | ||||||
| 206 | 0 | $matchpoint->{'components'} = []; | ||||
| 207 | 0 | $sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence"); | ||||
| 208 | 0 | $sth->execute($matchpoint_id); | ||||
| 209 | 0 | while ($row = $sth->fetchrow_hashref) { | ||||
| 210 | 0 | my $component = {}; | ||||
| 211 | 0 | $component->{'tag'} = $row->{'tag'}; | ||||
| 212 | 0 0 | $component->{'subfields'} = { map { $_ => 1 } split(//, $row->{'subfields'}) }; | ||||
| 213 | 0 | $component->{'offset'} = int($row->{'offset'}); | ||||
| 214 | 0 | $component->{'length'} = int($row->{'length'}); | ||||
| 215 | 0 | $component->{'norms'} = []; | ||||
| 216 | 0 | my $sth2 = $dbh->prepare_cached("SELECT * | ||||
| 217 | FROM matchpoint_component_norms | |||||
| 218 | WHERE matchpoint_component_id = ? ORDER BY sequence"); | |||||
| 219 | 0 | $sth2->execute($row->{'matchpoint_component_id'}); | ||||
| 220 | 0 | while (my $row2 = $sth2->fetchrow_hashref) { | ||||
| 221 | 0 0 | push @{ $component->{'norms'} }, $row2->{'norm_routine'}; | ||||
| 222 | } | |||||
| 223 | 0 0 | push @{ $matchpoint->{'components'} }, $component; | ||||
| 224 | } | |||||
| 225 | 0 | return $matchpoint; | ||||
| 226 | } | |||||
| 227 | ||||||
| 228 - 238 | =head2 store my $id = $matcher->store(); Stores matcher in database. The return value is the ID of the marc_matchers row. If the matcher was previously retrieved from the database via the fetch() method, the DB representation of the matcher is replaced. =cut | |||||
| 239 | ||||||
| 240 | sub store { | |||||
| 241 | 0 | my $self = shift; | ||||
| 242 | ||||||
| 243 | 0 | if (defined $self->{'id'}) { | ||||
| 244 | # update | |||||
| 245 | 0 | $self->_del_matcher_components(); | ||||
| 246 | 0 | $self->_update_marc_matchers(); | ||||
| 247 | } else { | |||||
| 248 | # create new | |||||
| 249 | 0 | $self->_new_marc_matchers(); | ||||
| 250 | } | |||||
| 251 | 0 | $self->_store_matcher_components(); | ||||
| 252 | 0 | return $self->{'id'}; | ||||
| 253 | } | |||||
| 254 | ||||||
| 255 | sub _del_matcher_components { | |||||
| 256 | 0 | my $self = shift; | ||||
| 257 | ||||||
| 258 | 0 | my $dbh = C4::Context->dbh(); | ||||
| 259 | 0 | my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?"); | ||||
| 260 | 0 | $sth->execute($self->{'id'}); | ||||
| 261 | 0 | $sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?"); | ||||
| 262 | 0 | $sth->execute($self->{'id'}); | ||||
| 263 | # foreign key delete cascades take care of deleting relevant rows | |||||
| 264 | # from matcher_matchpoints, matchpoint_components, and | |||||
| 265 | # matchpoint_component_norms | |||||
| 266 | } | |||||
| 267 | ||||||
| 268 | sub _update_marc_matchers { | |||||
| 269 | 0 | my $self = shift; | ||||
| 270 | ||||||
| 271 | 0 | my $dbh = C4::Context->dbh(); | ||||
| 272 | 0 | my $sth = $dbh->prepare_cached("UPDATE marc_matchers | ||||
| 273 | SET code = ?, | |||||
| 274 | description = ?, | |||||
| 275 | record_type = ?, | |||||
| 276 | threshold = ? | |||||
| 277 | WHERE matcher_id = ?"); | |||||
| 278 | 0 | $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'}); | ||||
| 279 | } | |||||
| 280 | ||||||
| 281 | sub _new_marc_matchers { | |||||
| 282 | 0 | my $self = shift; | ||||
| 283 | ||||||
| 284 | 0 | my $dbh = C4::Context->dbh(); | ||||
| 285 | 0 | my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers | ||||
| 286 | (code, description, record_type, threshold) | |||||
| 287 | VALUES (?, ?, ?, ?)"); | |||||
| 288 | 0 | $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}); | ||||
| 289 | 0 | $self->{'id'} = $dbh->{'mysql_insertid'}; | ||||
| 290 | } | |||||
| 291 | ||||||
| 292 | sub _store_matcher_components { | |||||
| 293 | 0 | my $self = shift; | ||||
| 294 | ||||||
| 295 | 0 | my $dbh = C4::Context->dbh(); | ||||
| 296 | 0 | my $sth; | ||||
| 297 | 0 | my $matcher_id = $self->{'id'}; | ||||
| 298 | 0 0 | foreach my $matchpoint (@{ $self->{'matchpoints'}}) { | ||||
| 299 | 0 | my $matchpoint_id = $self->_store_matchpoint($matchpoint); | ||||
| 300 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id) | ||||
| 301 | VALUES (?, ?)"); | |||||
| 302 | 0 | $sth->execute($matcher_id, $matchpoint_id); | ||||
| 303 | } | |||||
| 304 | 0 0 | foreach my $matchcheck (@{ $self->{'required_checks'} }) { | ||||
| 305 | 0 | my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'}); | ||||
| 306 | 0 | my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'}); | ||||
| 307 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchchecks | ||||
| 308 | (matcher_id, source_matchpoint_id, target_matchpoint_id) | |||||
| 309 | VALUES (?, ?, ?)"); | |||||
| 310 | 0 | $sth->execute($matcher_id, $source_matchpoint_id, $target_matchpoint_id); | ||||
| 311 | } | |||||
| 312 | ||||||
| 313 | } | |||||
| 314 | ||||||
| 315 | sub _store_matchpoint { | |||||
| 316 | 0 | my $self = shift; | ||||
| 317 | 0 | my $matchpoint = shift; | ||||
| 318 | ||||||
| 319 | 0 | my $dbh = C4::Context->dbh(); | ||||
| 320 | 0 | my $sth; | ||||
| 321 | 0 | my $matcher_id = $self->{'id'}; | ||||
| 322 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score) | ||||
| 323 | VALUES (?, ?, ?)"); | |||||
| 324 | 0 | $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'}); | ||||
| 325 | 0 | my $matchpoint_id = $dbh->{'mysql_insertid'}; | ||||
| 326 | 0 | my $seqnum = 0; | ||||
| 327 | 0 0 | foreach my $component (@{ $matchpoint->{'components'} }) { | ||||
| 328 | 0 | $seqnum++; | ||||
| 329 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components | ||||
| 330 | (matchpoint_id, sequence, tag, subfields, offset, length) | |||||
| 331 | VALUES (?, ?, ?, ?, ?, ?)"); | |||||
| 332 | 0 | $sth->bind_param(1, $matchpoint_id); | ||||
| 333 | 0 | $sth->bind_param(2, $seqnum); | ||||
| 334 | 0 | $sth->bind_param(3, $component->{'tag'}); | ||||
| 335 | 0 0 | $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} }); | ||||
| 336 | 0 | $sth->bind_param(5, $component->{'offset'}); | ||||
| 337 | 0 | $sth->bind_param(6, $component->{'length'}); | ||||
| 338 | 0 | $sth->execute(); | ||||
| 339 | 0 | my $matchpoint_component_id = $dbh->{'mysql_insertid'}; | ||||
| 340 | 0 | my $normseq = 0; | ||||
| 341 | 0 0 | foreach my $norm (@{ $component->{'norms'} }) { | ||||
| 342 | 0 | $normseq++; | ||||
| 343 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms | ||||
| 344 | (matchpoint_component_id, sequence, norm_routine) | |||||
| 345 | VALUES (?, ?, ?)"); | |||||
| 346 | 0 | $sth->execute($matchpoint_component_id, $normseq, $norm); | ||||
| 347 | } | |||||
| 348 | } | |||||
| 349 | 0 | return $matchpoint_id; | ||||
| 350 | } | |||||
| 351 | ||||||
| 352 | ||||||
| 353 - 360 | =head2 delete C4::Matcher->delete($id); Deletes the matcher of the specified ID from the database. =cut | |||||
| 361 | ||||||
| 362 | sub delete { | |||||
| 363 | 0 | my $class = shift; | ||||
| 364 | 0 | my $matcher_id = shift; | ||||
| 365 | ||||||
| 366 | 0 | my $dbh = C4::Context->dbh; | ||||
| 367 | 0 | my $sth = $dbh->prepare("DELETE FROM marc_matchers WHERE matcher_id = ?"); | ||||
| 368 | 0 | $sth->execute($matcher_id); # relying on cascading deletes to clean up everything | ||||
| 369 | } | |||||
| 370 | ||||||
| 371 - 378 | =head2 threshold $matcher->threshold(1000); my $threshold = $matcher->threshold(); Accessor method. =cut | |||||
| 379 | ||||||
| 380 | sub threshold { | |||||
| 381 | 0 | my $self = shift; | ||||
| 382 | 0 | @_ ? $self->{'threshold'} = shift : $self->{'threshold'}; | ||||
| 383 | } | |||||
| 384 | ||||||
| 385 - 394 | =head2 _id $matcher->_id(123); my $id = $matcher->_id(); Accessor method. Note that using this method to set the DB ID of the matcher should not be done outside of the editing CGI. =cut | |||||
| 395 | ||||||
| 396 | sub _id { | |||||
| 397 | 0 | my $self = shift; | ||||
| 398 | 0 | @_ ? $self->{'id'} = shift : $self->{'id'}; | ||||
| 399 | } | |||||
| 400 | ||||||
| 401 - 408 | =head2 code
$matcher->code('ISBN');
my $code = $matcher->code();
Accessor method.
=cut | |||||
| 409 | ||||||
| 410 | sub code { | |||||
| 411 | 0 | my $self = shift; | ||||
| 412 | 0 | @_ ? $self->{'code'} = shift : $self->{'code'}; | ||||
| 413 | } | |||||
| 414 | ||||||
| 415 - 422 | =head2 description
$matcher->description('match on ISBN');
my $description = $matcher->description();
Accessor method.
=cut | |||||
| 423 | ||||||
| 424 | sub description { | |||||
| 425 | 0 | my $self = shift; | ||||
| 426 | 0 | @_ ? $self->{'description'} = shift : $self->{'description'}; | ||||
| 427 | } | |||||
| 428 | ||||||
| 429 - 451 | =head2 add_matchpoint
$matcher->add_matchpoint($index, $score, $matchcomponents);
Adds a matchpoint that may include multiple components. The $index
parameter identifies the index that will be searched, while $score
is the weight that will be added if a match is found.
$matchcomponents should be a reference to an array of matchpoint
compoents, each of which should be a hash containing the following
keys:
tag
subfields
offset
length
norms
The normalization_rules value should in turn be a reference to an
array, each element of which should be a reference to a
normalization subroutine (under C4::Normalize) to be applied
to the source string.
=cut | |||||
| 452 | ||||||
| 453 | sub add_matchpoint { | |||||
| 454 | 0 | my $self = shift; | ||||
| 455 | 0 | my ($index, $score, $matchcomponents) = @_; | ||||
| 456 | ||||||
| 457 | 0 | my $matchpoint = {}; | ||||
| 458 | 0 | $matchpoint->{'index'} = $index; | ||||
| 459 | 0 | $matchpoint->{'score'} = $score; | ||||
| 460 | 0 | $matchpoint->{'components'} = []; | ||||
| 461 | 0 0 | foreach my $input_component (@{ $matchcomponents }) { | ||||
| 462 | 0 0 | push @{ $matchpoint->{'components'} }, _parse_match_component($input_component); | ||||
| 463 | } | |||||
| 464 | 0 0 | push @{ $self->{'matchpoints'} }, $matchpoint; | ||||
| 465 | } | |||||
| 466 | ||||||
| 467 - 478 | =head2 add_simple_matchpoint
$matcher->add_simple_matchpoint($index, $score, $source_tag,
$source_subfields, $source_offset,
$source_length, $source_normalizer);
Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields,
normalized per the normalization fuction, search the index. All records retrieved
will receive the assigned score.
=cut | |||||
| 479 | ||||||
| 480 | sub add_simple_matchpoint { | |||||
| 481 | 0 | my $self = shift; | ||||
| 482 | 0 | my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_; | ||||
| 483 | ||||||
| 484 | 0 | $self->add_matchpoint($index, $score, [ | ||||
| 485 | { tag => $source_tag, subfields => $source_subfields, | |||||
| 486 | offset => $source_offset, 'length' => $source_length, | |||||
| 487 | norms => [ $source_normalizer ] | |||||
| 488 | } | |||||
| 489 | ]); | |||||
| 490 | } | |||||
| 491 | ||||||
| 492 - 522 | =head2 add_required_check
$match->add_required_check($source_matchpoint, $target_matchpoint);
Adds a required check definition. A required check means that in
order for a match to be considered valid, the key derived from the
source (incoming) record must match the key derived from the target
(already in DB) record.
Unlike a regular matchpoint, only the first repeat of each tag
in the source and target match criteria are considered.
A typical example of a required check would be verifying that the
titles and publication dates match.
$source_matchpoint and $target_matchpoint are each a reference to
an array of hashes, where each hash follows the same definition
as the matchpoint component specification in add_matchpoint, i.e.,
tag
subfields
offset
length
norms
The normalization_rules value should in turn be a reference to an
array, each element of which should be a reference to a
normalization subroutine (under C4::Normalize) to be applied
to the source string.
=cut | |||||
| 523 | ||||||
| 524 | sub add_required_check { | |||||
| 525 | 0 | my $self = shift; | ||||
| 526 | 0 | my ($source_matchpoint, $target_matchpoint) = @_; | ||||
| 527 | ||||||
| 528 | 0 | my $matchcheck = {}; | ||||
| 529 | 0 | $matchcheck->{'source_matchpoint'}->{'index'} = ''; | ||||
| 530 | 0 | $matchcheck->{'source_matchpoint'}->{'score'} = 0; | ||||
| 531 | 0 | $matchcheck->{'source_matchpoint'}->{'components'} = []; | ||||
| 532 | 0 | $matchcheck->{'target_matchpoint'}->{'index'} = ''; | ||||
| 533 | 0 | $matchcheck->{'target_matchpoint'}->{'score'} = 0; | ||||
| 534 | 0 | $matchcheck->{'target_matchpoint'}->{'components'} = []; | ||||
| 535 | 0 0 | foreach my $input_component (@{ $source_matchpoint }) { | ||||
| 536 | 0 0 | push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component); | ||||
| 537 | } | |||||
| 538 | 0 0 | foreach my $input_component (@{ $target_matchpoint }) { | ||||
| 539 | 0 0 | push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component); | ||||
| 540 | } | |||||
| 541 | 0 0 | push @{ $self->{'required_checks'} }, $matchcheck; | ||||
| 542 | } | |||||
| 543 | ||||||
| 544 - 554 | =head2 add_simple_required_check
$matcher->add_simple_required_check($source_tag, $source_subfields,
$source_offset, $source_length, $source_normalizer,
$target_tag, $target_subfields, $target_offset,
$target_length, $target_normalizer);
Adds a required check, which requires that the normalized keys made from the source and targets
must match for a match to be considered valid.
=cut | |||||
| 555 | ||||||
| 556 | sub add_simple_required_check { | |||||
| 557 | 0 | my $self = shift; | ||||
| 558 | 0 | my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer, | ||||
| 559 | $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_; | |||||
| 560 | ||||||
| 561 | 0 | $self->add_required_check( | ||||
| 562 | [ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, 'length' => $source_length, | |||||
| 563 | norms => [ $source_normalizer ] } ], | |||||
| 564 | [ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, 'length' => $target_length, | |||||
| 565 | norms => [ $target_normalizer ] } ] | |||||
| 566 | ); | |||||
| 567 | } | |||||
| 568 | ||||||
| 569 - 593 | =head2 find_matches
my @matches = $matcher->get_matches($marc_record, $max_matches);
foreach $match (@matches) {
# matches already sorted in order of
# decreasing score
print "record ID: $match->{'record_id'};
print "score: $match->{'score'};
}
Identifies all of the records matching the given MARC record. For a record already
in the database to be considered a match, it must meet the following criteria:
=over 2
=item 1. Total score from its matching field must exceed the supplied threshold.
=item 2. It must pass all required checks.
=back
Only the top $max_matches matches are returned. The returned array is sorted
in order of decreasing score, i.e., the best match is first.
=cut | |||||
| 594 | ||||||
| 595 | sub get_matches { | |||||
| 596 | 0 | my $self = shift; | ||||
| 597 | 0 | my ($source_record, $max_matches) = @_; | ||||
| 598 | ||||||
| 599 | 0 | my %matches = (); | ||||
| 600 | ||||||
| 601 | 0 0 | foreach my $matchpoint (@{ $self->{'matchpoints'} }) { | ||||
| 602 | 0 | my @source_keys = _get_match_keys($source_record, $matchpoint); | ||||
| 603 | 0 | next if scalar(@source_keys) == 0; | ||||
| 604 | # build query | |||||
| 605 | 0 0 | my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys); | ||||
| 606 | # FIXME only searching biblio index at the moment | |||||
| 607 | 0 | my ($error, $searchresults, $total_hits) = SimpleSearch($query, 0, $max_matches); | ||||
| 608 | ||||||
| 609 | 0 | if (defined $error ) { | ||||
| 610 | 0 | warn "search failed ($query) $error"; | ||||
| 611 | } else { | |||||
| 612 | 0 0 | foreach my $matched (@{$searchresults}) { | ||||
| 613 | 0 | $matches{$matched} += $matchpoint->{'score'}; | ||||
| 614 | } | |||||
| 615 | } | |||||
| 616 | } | |||||
| 617 | ||||||
| 618 | # get rid of any that don't meet the threshold | |||||
| 619 | 0 0 | %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches; | ||||
| 620 | ||||||
| 621 | # get rid of any that don't meet the required checks | |||||
| 622 | 0 0 | %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ? ($_ => $matches{$_}) : () } | ||||
| 623 | keys %matches; | |||||
| 624 | ||||||
| 625 | 0 | my @results = (); | ||||
| 626 | 0 | foreach my $marcblob (keys %matches) { | ||||
| 627 | 0 | my $target_record = MARC::Record->new_from_usmarc($marcblob); | ||||
| 628 | 0 | my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, ''); | ||||
| 629 | # FIXME - again, bibliospecific | |||||
| 630 | # also, can search engine be induced to give just the number in the first place? | |||||
| 631 | 0 | my $record_number = $result->{'biblionumber'}; | ||||
| 632 | 0 | push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} }; | ||||
| 633 | } | |||||
| 634 | 0 0 | @results = sort { $b->{'score'} cmp $a->{'score'} } @results; | ||||
| 635 | 0 | if (scalar(@results) > $max_matches) { | ||||
| 636 | 0 | @results = @results[0..$max_matches-1]; | ||||
| 637 | } | |||||
| 638 | 0 | return @results; | ||||
| 639 | ||||||
| 640 | } | |||||
| 641 | ||||||
| 642 - 650 | =head2 dump $description = $matcher->dump(); Returns a reference to a structure containing all of the information in the matcher object. This is mainly a convenience method to aid setting up a HTML editing form. =cut | |||||
| 651 | ||||||
| 652 | sub dump { | |||||
| 653 | 0 | my $self = shift; | ||||
| 654 | ||||||
| 655 | 0 | my $result = {}; | ||||
| 656 | ||||||
| 657 | 0 | $result->{'matcher_id'} = $self->{'id'}; | ||||
| 658 | 0 | $result->{'code'} = $self->{'code'}; | ||||
| 659 | 0 | $result->{'description'} = $self->{'description'}; | ||||
| 660 | ||||||
| 661 | 0 | $result->{'matchpoints'} = []; | ||||
| 662 | 0 0 | foreach my $matchpoint (@{ $self->{'matchpoints'} }) { | ||||
| 663 | 0 0 | push @{ $result->{'matchpoints'} }, $matchpoint; | ||||
| 664 | } | |||||
| 665 | 0 | $result->{'matchchecks'} = []; | ||||
| 666 | 0 0 | foreach my $matchcheck (@{ $self->{'required_checks'} }) { | ||||
| 667 | 0 0 | push @{ $result->{'matchchecks'} }, $matchcheck; | ||||
| 668 | } | |||||
| 669 | ||||||
| 670 | 0 | return $result; | ||||
| 671 | } | |||||
| 672 | ||||||
| 673 | sub _passes_required_checks { | |||||
| 674 | 0 | my ($source_record, $target_blob, $matchchecks) = @_; | ||||
| 675 | 0 | my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice | ||||
| 676 | ||||||
| 677 | # no checks supplied == automatic pass | |||||
| 678 | 0 0 | return 1 if $#{ $matchchecks } == -1; | ||||
| 679 | ||||||
| 680 | 0 0 | foreach my $matchcheck (@{ $matchchecks }) { | ||||
| 681 | 0 | my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'}); | ||||
| 682 | 0 | my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'}); | ||||
| 683 | 0 | return 0 unless $source_key eq $target_key; | ||||
| 684 | } | |||||
| 685 | 0 | return 1; | ||||
| 686 | } | |||||
| 687 | ||||||
| 688 | sub _get_match_keys { | |||||
| 689 | 0 | my $source_record = shift; | ||||
| 690 | 0 | my $matchpoint = shift; | ||||
| 691 | 0 | my $check_only_first_repeat = @_ ? shift : 0; | ||||
| 692 | ||||||
| 693 | # If there is more than one component to the matchpoint (e.g., | |||||
| 694 | # matchpoint includes both 003 and 001), any repeats | |||||
| 695 | # of the first component's tag are identified; repeats | |||||
| 696 | # of the subsequent components' tags are appended to | |||||
| 697 | # each parallel key dervied from the first component, | |||||
| 698 | # up to the number of repeats of the first component's tag. | |||||
| 699 | # | |||||
| 700 | # For example, if the record has one 003 and two 001s, only | |||||
| 701 | # one key is retrieved because there is only one 003. The key | |||||
| 702 | # will consist of the contents of the first 003 and first 001. | |||||
| 703 | # | |||||
| 704 | # If there are two 003s and two 001s, there will be two keys: | |||||
| 705 | # first 003 + first 001 | |||||
| 706 | # second 003 + second 001 | |||||
| 707 | ||||||
| 708 | 0 | my @keys = (); | ||||
| 709 | 0 | for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) { | ||||
| 710 | 0 | my $component = $matchpoint->{'components'}->[$i]; | ||||
| 711 | 0 | my $j = -1; | ||||
| 712 | 0 | FIELD: foreach my $field ($source_record->field($component->{'tag'})) { | ||||
| 713 | 0 | $j++; | ||||
| 714 | 0 | last FIELD if $j > 0 and $check_only_first_repeat; | ||||
| 715 | 0 | last FIELD if $i > 0 and $j > $#keys; | ||||
| 716 | 0 | my $key = ""; | ||||
| 717 | 0 | my $string; | ||||
| 718 | 0 | if ($field->is_control_field()) { | ||||
| 719 | 0 | $string=$field->data(); | ||||
| 720 | } else { | |||||
| 721 | 0 | foreach my $subfield ($field->subfields()) { | ||||
| 722 | 0 | if (exists $component->{'subfields'}->{$subfield->[0]}) { | ||||
| 723 | 0 | $string .= " " . $subfield->[1]; | ||||
| 724 | } | |||||
| 725 | } | |||||
| 726 | } | |||||
| 727 | 0 | if ($component->{'length'}>0) { | ||||
| 728 | 0 | $string= substr($string, $component->{'offset'}, $component->{'length'}); | ||||
| 729 | # FIXME normalize, substr | |||||
| 730 | } elsif ($component->{'offset'}) { | |||||
| 731 | 0 | $string= substr($string, $component->{'offset'}); | ||||
| 732 | } | |||||
| 733 | 0 | $key = _normalize($string); | ||||
| 734 | 0 | if ($i == 0) { | ||||
| 735 | 0 | push @keys, $key if $key; | ||||
| 736 | } else { | |||||
| 737 | 0 | $keys[$j] .= " $key" if $key; | ||||
| 738 | } | |||||
| 739 | } | |||||
| 740 | 0 | } | ||||
| 741 | 0 | return @keys; | ||||
| 742 | } | |||||
| 743 | ||||||
| 744 | ||||||
| 745 | sub _parse_match_component { | |||||
| 746 | 0 | my $input_component = shift; | ||||
| 747 | ||||||
| 748 | 0 | my $component = {}; | ||||
| 749 | 0 | $component->{'tag'} = $input_component->{'tag'}; | ||||
| 750 | 0 0 | $component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) }; | ||||
| 751 | 0 | $component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1; | ||||
| 752 | 0 | $component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0; | ||||
| 753 | 0 | $component->{'norms'} = $input_component->{'norms'} ? $input_component->{'norms'} : []; | ||||
| 754 | ||||||
| 755 | 0 | return $component; | ||||
| 756 | } | |||||
| 757 | ||||||
| 758 | # FIXME - default normalizer | |||||
| 759 | sub _normalize { | |||||
| 760 | 0 | my $value = uc shift; | ||||
| 761 | 0 | $value =~ s/[.;:,\]\[\)\(\/'"]//g; | ||||
| 762 | 0 | $value =~ s/^\s+//; | ||||
| 763 | #$value =~ s/^\s+$//; | |||||
| 764 | 0 | $value =~ s/\s+$//; | ||||
| 765 | 0 | $value =~ s/\s+/ /g; | ||||
| 766 | #$value =~ s/[.;,\]\[\)\(\/"']//g; | |||||
| 767 | 0 | return $value; | ||||
| 768 | } | |||||
| 769 | ||||||
| 770 | 1; | |||||