File: | C4/Matcher.pm |
Coverage: | 7.2% |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | package C4::Matcher; | |||||
2 | ||||||
3 | # Copyright (C) 2007 LibLime | |||||
4 | # | |||||
5 | # This file is part of Koha. | |||||
6 | # | |||||
7 | # Koha is free software; you can redistribute it and/or modify it under the | |||||
8 | # terms of the GNU General Public License as published by the Free Software | |||||
9 | # Foundation; either version 2 of the License, or (at your option) any later | |||||
10 | # version. | |||||
11 | # | |||||
12 | # Koha is distributed in the hope that it will be useful, but WITHOUT ANY | |||||
13 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR | |||||
14 | # A PARTICULAR PURPOSE. See the GNU General Public License for more details. | |||||
15 | # | |||||
16 | # You should have received a copy of the GNU General Public License along | |||||
17 | # with Koha; if not, write to the Free Software Foundation, Inc., | |||||
18 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |||||
19 | ||||||
20 | 3 3 3 | 19410 9 97 | use strict; | |||
21 | 3 3 3 | 15 30 138 | use warnings; | |||
22 | ||||||
23 | 3 3 3 | 205 18 41 | use C4::Context; | |||
24 | 3 3 3 | 354 7677 169 | use MARC::Record; | |||
25 | 3 3 3 | 277 6 583 | use C4::Search; | |||
26 | 3 3 3 | 34 5 1438 | use C4::Biblio; | |||
27 | ||||||
28 | 3 3 3 | 14 4 140 | use vars qw($VERSION); | |||
29 | ||||||
30 | BEGIN { | |||||
31 | # set the version for version checking | |||||
32 | 3 | 12798 | $VERSION = 3.01; | |||
33 | } | |||||
34 | ||||||
35 - 71 | =head1 NAME C4::Matcher - find MARC records matching another one =head1 SYNOPSIS my @matchers = C4::Matcher::GetMatcherList(); my $matcher = C4::Matcher->new($record_type); $matcher->threshold($threshold); $matcher->code($code); $matcher->description($description); $matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, ''); $matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, ''); $matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]); $matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, ''); $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ], [ { tag => '245', subfields => 'a', norms => [] } ]); my @matches = $matcher->get_matches($marc_record, $max_matches); foreach $match (@matches) { # matches already sorted in order of # decreasing score print "record ID: $match->{'record_id'}; print "score: $match->{'score'}; } my $matcher_description = $matcher->dump(); =head1 FUNCTIONS =cut | |||||
72 | ||||||
73 - 84 | =head2 GetMatcherList my @matchers = C4::Matcher::GetMatcherList(); Returns an array of hashrefs list all matchers present in the database. Each hashref includes: * matcher_id * code * description =cut | |||||
85 | ||||||
86 | sub GetMatcherList { | |||||
87 | 0 | my $dbh = C4::Context->dbh; | ||||
88 | ||||||
89 | 0 | my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id"); | ||||
90 | 0 | $sth->execute(); | ||||
91 | 0 | my @results = (); | ||||
92 | 0 | while (my $row = $sth->fetchrow_hashref) { | ||||
93 | 0 | push @results, $row; | ||||
94 | } | |||||
95 | 0 | return @results; | ||||
96 | } | |||||
97 | ||||||
98 - 109 | =head1 METHODS =head2 new my $matcher = C4::Matcher->new($record_type, $threshold); Creates a new Matcher. C<$record_type> indicates which search database to use, e.g., 'biblio' or 'authority' and defaults to 'biblio', while C<$threshold> is the minimum score required for a match and defaults to 1000. =cut | |||||
110 | ||||||
111 | sub new { | |||||
112 | 0 | my $class = shift; | ||||
113 | 0 | my $self = {}; | ||||
114 | ||||||
115 | 0 | $self->{'id'} = undef; | ||||
116 | ||||||
117 | 0 | if ($#_ > -1) { | ||||
118 | 0 | $self->{'record_type'} = shift; | ||||
119 | } else { | |||||
120 | 0 | $self->{'record_type'} = 'biblio'; | ||||
121 | } | |||||
122 | ||||||
123 | 0 | if ($#_ > -1) { | ||||
124 | 0 | $self->{'threshold'} = shift; | ||||
125 | } else { | |||||
126 | 0 | $self->{'threshold'} = 1000; | ||||
127 | } | |||||
128 | ||||||
129 | 0 | $self->{'code'} = ''; | ||||
130 | 0 | $self->{'description'} = ''; | ||||
131 | ||||||
132 | 0 | $self->{'matchpoints'} = []; | ||||
133 | 0 | $self->{'required_checks'} = []; | ||||
134 | ||||||
135 | 0 | bless $self, $class; | ||||
136 | 0 | return $self; | ||||
137 | } | |||||
138 | ||||||
139 - 147 | =head2 fetch my $matcher = C4::Matcher->fetch($id); Creates a matcher object from the version stored in the database. If a matcher with the given id does not exist, returns undef. =cut | |||||
148 | ||||||
149 | sub fetch { | |||||
150 | 0 | my $class = shift; | ||||
151 | 0 | my $id = shift; | ||||
152 | 0 | my $dbh = C4::Context->dbh(); | ||||
153 | ||||||
154 | 0 | my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?"); | ||||
155 | 0 | $sth->execute($id); | ||||
156 | 0 | my $row = $sth->fetchrow_hashref; | ||||
157 | 0 | $sth->finish(); | ||||
158 | 0 | return undef unless defined $row; | ||||
159 | ||||||
160 | 0 | my $self = {}; | ||||
161 | 0 | $self->{'id'} = $row->{'matcher_id'}; | ||||
162 | 0 | $self->{'record_type'} = $row->{'record_type'}; | ||||
163 | 0 | $self->{'code'} = $row->{'code'}; | ||||
164 | 0 | $self->{'description'} = $row->{'description'}; | ||||
165 | 0 | $self->{'threshold'} = int($row->{'threshold'}); | ||||
166 | 0 | bless $self, $class; | ||||
167 | ||||||
168 | # matchpoints | |||||
169 | 0 | $self->{'matchpoints'} = []; | ||||
170 | 0 | $sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id"); | ||||
171 | 0 | $sth->execute($self->{'id'}); | ||||
172 | 0 | while (my $row = $sth->fetchrow_hashref) { | ||||
173 | 0 | my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'}); | ||||
174 | 0 0 | push @{ $self->{'matchpoints'} }, $matchpoint; | ||||
175 | } | |||||
176 | ||||||
177 | # required checks | |||||
178 | 0 | $self->{'required_checks'} = []; | ||||
179 | 0 | $sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id"); | ||||
180 | 0 | $sth->execute($self->{'id'}); | ||||
181 | 0 | while (my $row = $sth->fetchrow_hashref) { | ||||
182 | 0 | my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'}); | ||||
183 | 0 | my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'}); | ||||
184 | 0 | my $matchcheck = {}; | ||||
185 | 0 | $matchcheck->{'source_matchpoint'} = $source_matchpoint; | ||||
186 | 0 | $matchcheck->{'target_matchpoint'} = $target_matchpoint; | ||||
187 | 0 0 | push @{ $self->{'required_checks'} }, $matchcheck; | ||||
188 | } | |||||
189 | ||||||
190 | 0 | return $self; | ||||
191 | } | |||||
192 | ||||||
193 | sub _fetch_matchpoint { | |||||
194 | 0 | my $self = shift; | ||||
195 | 0 | my $matchpoint_id = shift; | ||||
196 | ||||||
197 | 0 | my $dbh = C4::Context->dbh; | ||||
198 | 0 | my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?"); | ||||
199 | 0 | $sth->execute($matchpoint_id); | ||||
200 | 0 | my $row = $sth->fetchrow_hashref; | ||||
201 | 0 | my $matchpoint = {}; | ||||
202 | 0 | $matchpoint->{'index'} = $row->{'search_index'}; | ||||
203 | 0 | $matchpoint->{'score'} = int($row->{'score'}); | ||||
204 | 0 | $sth->finish(); | ||||
205 | ||||||
206 | 0 | $matchpoint->{'components'} = []; | ||||
207 | 0 | $sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence"); | ||||
208 | 0 | $sth->execute($matchpoint_id); | ||||
209 | 0 | while ($row = $sth->fetchrow_hashref) { | ||||
210 | 0 | my $component = {}; | ||||
211 | 0 | $component->{'tag'} = $row->{'tag'}; | ||||
212 | 0 0 | $component->{'subfields'} = { map { $_ => 1 } split(//, $row->{'subfields'}) }; | ||||
213 | 0 | $component->{'offset'} = int($row->{'offset'}); | ||||
214 | 0 | $component->{'length'} = int($row->{'length'}); | ||||
215 | 0 | $component->{'norms'} = []; | ||||
216 | 0 | my $sth2 = $dbh->prepare_cached("SELECT * | ||||
217 | FROM matchpoint_component_norms | |||||
218 | WHERE matchpoint_component_id = ? ORDER BY sequence"); | |||||
219 | 0 | $sth2->execute($row->{'matchpoint_component_id'}); | ||||
220 | 0 | while (my $row2 = $sth2->fetchrow_hashref) { | ||||
221 | 0 0 | push @{ $component->{'norms'} }, $row2->{'norm_routine'}; | ||||
222 | } | |||||
223 | 0 0 | push @{ $matchpoint->{'components'} }, $component; | ||||
224 | } | |||||
225 | 0 | return $matchpoint; | ||||
226 | } | |||||
227 | ||||||
228 - 238 | =head2 store my $id = $matcher->store(); Stores matcher in database. The return value is the ID of the marc_matchers row. If the matcher was previously retrieved from the database via the fetch() method, the DB representation of the matcher is replaced. =cut | |||||
239 | ||||||
240 | sub store { | |||||
241 | 0 | my $self = shift; | ||||
242 | ||||||
243 | 0 | if (defined $self->{'id'}) { | ||||
244 | # update | |||||
245 | 0 | $self->_del_matcher_components(); | ||||
246 | 0 | $self->_update_marc_matchers(); | ||||
247 | } else { | |||||
248 | # create new | |||||
249 | 0 | $self->_new_marc_matchers(); | ||||
250 | } | |||||
251 | 0 | $self->_store_matcher_components(); | ||||
252 | 0 | return $self->{'id'}; | ||||
253 | } | |||||
254 | ||||||
255 | sub _del_matcher_components { | |||||
256 | 0 | my $self = shift; | ||||
257 | ||||||
258 | 0 | my $dbh = C4::Context->dbh(); | ||||
259 | 0 | my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?"); | ||||
260 | 0 | $sth->execute($self->{'id'}); | ||||
261 | 0 | $sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?"); | ||||
262 | 0 | $sth->execute($self->{'id'}); | ||||
263 | # foreign key delete cascades take care of deleting relevant rows | |||||
264 | # from matcher_matchpoints, matchpoint_components, and | |||||
265 | # matchpoint_component_norms | |||||
266 | } | |||||
267 | ||||||
268 | sub _update_marc_matchers { | |||||
269 | 0 | my $self = shift; | ||||
270 | ||||||
271 | 0 | my $dbh = C4::Context->dbh(); | ||||
272 | 0 | my $sth = $dbh->prepare_cached("UPDATE marc_matchers | ||||
273 | SET code = ?, | |||||
274 | description = ?, | |||||
275 | record_type = ?, | |||||
276 | threshold = ? | |||||
277 | WHERE matcher_id = ?"); | |||||
278 | 0 | $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'}); | ||||
279 | } | |||||
280 | ||||||
281 | sub _new_marc_matchers { | |||||
282 | 0 | my $self = shift; | ||||
283 | ||||||
284 | 0 | my $dbh = C4::Context->dbh(); | ||||
285 | 0 | my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers | ||||
286 | (code, description, record_type, threshold) | |||||
287 | VALUES (?, ?, ?, ?)"); | |||||
288 | 0 | $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}); | ||||
289 | 0 | $self->{'id'} = $dbh->{'mysql_insertid'}; | ||||
290 | } | |||||
291 | ||||||
292 | sub _store_matcher_components { | |||||
293 | 0 | my $self = shift; | ||||
294 | ||||||
295 | 0 | my $dbh = C4::Context->dbh(); | ||||
296 | 0 | my $sth; | ||||
297 | 0 | my $matcher_id = $self->{'id'}; | ||||
298 | 0 0 | foreach my $matchpoint (@{ $self->{'matchpoints'}}) { | ||||
299 | 0 | my $matchpoint_id = $self->_store_matchpoint($matchpoint); | ||||
300 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id) | ||||
301 | VALUES (?, ?)"); | |||||
302 | 0 | $sth->execute($matcher_id, $matchpoint_id); | ||||
303 | } | |||||
304 | 0 0 | foreach my $matchcheck (@{ $self->{'required_checks'} }) { | ||||
305 | 0 | my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'}); | ||||
306 | 0 | my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'}); | ||||
307 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchchecks | ||||
308 | (matcher_id, source_matchpoint_id, target_matchpoint_id) | |||||
309 | VALUES (?, ?, ?)"); | |||||
310 | 0 | $sth->execute($matcher_id, $source_matchpoint_id, $target_matchpoint_id); | ||||
311 | } | |||||
312 | ||||||
313 | } | |||||
314 | ||||||
315 | sub _store_matchpoint { | |||||
316 | 0 | my $self = shift; | ||||
317 | 0 | my $matchpoint = shift; | ||||
318 | ||||||
319 | 0 | my $dbh = C4::Context->dbh(); | ||||
320 | 0 | my $sth; | ||||
321 | 0 | my $matcher_id = $self->{'id'}; | ||||
322 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score) | ||||
323 | VALUES (?, ?, ?)"); | |||||
324 | 0 | $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'}); | ||||
325 | 0 | my $matchpoint_id = $dbh->{'mysql_insertid'}; | ||||
326 | 0 | my $seqnum = 0; | ||||
327 | 0 0 | foreach my $component (@{ $matchpoint->{'components'} }) { | ||||
328 | 0 | $seqnum++; | ||||
329 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components | ||||
330 | (matchpoint_id, sequence, tag, subfields, offset, length) | |||||
331 | VALUES (?, ?, ?, ?, ?, ?)"); | |||||
332 | 0 | $sth->bind_param(1, $matchpoint_id); | ||||
333 | 0 | $sth->bind_param(2, $seqnum); | ||||
334 | 0 | $sth->bind_param(3, $component->{'tag'}); | ||||
335 | 0 0 | $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} }); | ||||
336 | 0 | $sth->bind_param(5, $component->{'offset'}); | ||||
337 | 0 | $sth->bind_param(6, $component->{'length'}); | ||||
338 | 0 | $sth->execute(); | ||||
339 | 0 | my $matchpoint_component_id = $dbh->{'mysql_insertid'}; | ||||
340 | 0 | my $normseq = 0; | ||||
341 | 0 0 | foreach my $norm (@{ $component->{'norms'} }) { | ||||
342 | 0 | $normseq++; | ||||
343 | 0 | $sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms | ||||
344 | (matchpoint_component_id, sequence, norm_routine) | |||||
345 | VALUES (?, ?, ?)"); | |||||
346 | 0 | $sth->execute($matchpoint_component_id, $normseq, $norm); | ||||
347 | } | |||||
348 | } | |||||
349 | 0 | return $matchpoint_id; | ||||
350 | } | |||||
351 | ||||||
352 | ||||||
353 - 360 | =head2 delete C4::Matcher->delete($id); Deletes the matcher of the specified ID from the database. =cut | |||||
361 | ||||||
362 | sub delete { | |||||
363 | 0 | my $class = shift; | ||||
364 | 0 | my $matcher_id = shift; | ||||
365 | ||||||
366 | 0 | my $dbh = C4::Context->dbh; | ||||
367 | 0 | my $sth = $dbh->prepare("DELETE FROM marc_matchers WHERE matcher_id = ?"); | ||||
368 | 0 | $sth->execute($matcher_id); # relying on cascading deletes to clean up everything | ||||
369 | } | |||||
370 | ||||||
371 - 378 | =head2 threshold $matcher->threshold(1000); my $threshold = $matcher->threshold(); Accessor method. =cut | |||||
379 | ||||||
380 | sub threshold { | |||||
381 | 0 | my $self = shift; | ||||
382 | 0 | @_ ? $self->{'threshold'} = shift : $self->{'threshold'}; | ||||
383 | } | |||||
384 | ||||||
385 - 394 | =head2 _id $matcher->_id(123); my $id = $matcher->_id(); Accessor method. Note that using this method to set the DB ID of the matcher should not be done outside of the editing CGI. =cut | |||||
395 | ||||||
396 | sub _id { | |||||
397 | 0 | my $self = shift; | ||||
398 | 0 | @_ ? $self->{'id'} = shift : $self->{'id'}; | ||||
399 | } | |||||
400 | ||||||
401 - 408 | =head2 code $matcher->code('ISBN'); my $code = $matcher->code(); Accessor method. =cut | |||||
409 | ||||||
410 | sub code { | |||||
411 | 0 | my $self = shift; | ||||
412 | 0 | @_ ? $self->{'code'} = shift : $self->{'code'}; | ||||
413 | } | |||||
414 | ||||||
415 - 422 | =head2 description $matcher->description('match on ISBN'); my $description = $matcher->description(); Accessor method. =cut | |||||
423 | ||||||
424 | sub description { | |||||
425 | 0 | my $self = shift; | ||||
426 | 0 | @_ ? $self->{'description'} = shift : $self->{'description'}; | ||||
427 | } | |||||
428 | ||||||
429 - 451 | =head2 add_matchpoint $matcher->add_matchpoint($index, $score, $matchcomponents); Adds a matchpoint that may include multiple components. The $index parameter identifies the index that will be searched, while $score is the weight that will be added if a match is found. $matchcomponents should be a reference to an array of matchpoint compoents, each of which should be a hash containing the following keys: tag subfields offset length norms The normalization_rules value should in turn be a reference to an array, each element of which should be a reference to a normalization subroutine (under C4::Normalize) to be applied to the source string. =cut | |||||
452 | ||||||
453 | sub add_matchpoint { | |||||
454 | 0 | my $self = shift; | ||||
455 | 0 | my ($index, $score, $matchcomponents) = @_; | ||||
456 | ||||||
457 | 0 | my $matchpoint = {}; | ||||
458 | 0 | $matchpoint->{'index'} = $index; | ||||
459 | 0 | $matchpoint->{'score'} = $score; | ||||
460 | 0 | $matchpoint->{'components'} = []; | ||||
461 | 0 0 | foreach my $input_component (@{ $matchcomponents }) { | ||||
462 | 0 0 | push @{ $matchpoint->{'components'} }, _parse_match_component($input_component); | ||||
463 | } | |||||
464 | 0 0 | push @{ $self->{'matchpoints'} }, $matchpoint; | ||||
465 | } | |||||
466 | ||||||
467 - 478 | =head2 add_simple_matchpoint $matcher->add_simple_matchpoint($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer); Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields, normalized per the normalization fuction, search the index. All records retrieved will receive the assigned score. =cut | |||||
479 | ||||||
480 | sub add_simple_matchpoint { | |||||
481 | 0 | my $self = shift; | ||||
482 | 0 | my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_; | ||||
483 | ||||||
484 | 0 | $self->add_matchpoint($index, $score, [ | ||||
485 | { tag => $source_tag, subfields => $source_subfields, | |||||
486 | offset => $source_offset, 'length' => $source_length, | |||||
487 | norms => [ $source_normalizer ] | |||||
488 | } | |||||
489 | ]); | |||||
490 | } | |||||
491 | ||||||
492 - 522 | =head2 add_required_check $match->add_required_check($source_matchpoint, $target_matchpoint); Adds a required check definition. A required check means that in order for a match to be considered valid, the key derived from the source (incoming) record must match the key derived from the target (already in DB) record. Unlike a regular matchpoint, only the first repeat of each tag in the source and target match criteria are considered. A typical example of a required check would be verifying that the titles and publication dates match. $source_matchpoint and $target_matchpoint are each a reference to an array of hashes, where each hash follows the same definition as the matchpoint component specification in add_matchpoint, i.e., tag subfields offset length norms The normalization_rules value should in turn be a reference to an array, each element of which should be a reference to a normalization subroutine (under C4::Normalize) to be applied to the source string. =cut | |||||
523 | ||||||
524 | sub add_required_check { | |||||
525 | 0 | my $self = shift; | ||||
526 | 0 | my ($source_matchpoint, $target_matchpoint) = @_; | ||||
527 | ||||||
528 | 0 | my $matchcheck = {}; | ||||
529 | 0 | $matchcheck->{'source_matchpoint'}->{'index'} = ''; | ||||
530 | 0 | $matchcheck->{'source_matchpoint'}->{'score'} = 0; | ||||
531 | 0 | $matchcheck->{'source_matchpoint'}->{'components'} = []; | ||||
532 | 0 | $matchcheck->{'target_matchpoint'}->{'index'} = ''; | ||||
533 | 0 | $matchcheck->{'target_matchpoint'}->{'score'} = 0; | ||||
534 | 0 | $matchcheck->{'target_matchpoint'}->{'components'} = []; | ||||
535 | 0 0 | foreach my $input_component (@{ $source_matchpoint }) { | ||||
536 | 0 0 | push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component); | ||||
537 | } | |||||
538 | 0 0 | foreach my $input_component (@{ $target_matchpoint }) { | ||||
539 | 0 0 | push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component); | ||||
540 | } | |||||
541 | 0 0 | push @{ $self->{'required_checks'} }, $matchcheck; | ||||
542 | } | |||||
543 | ||||||
544 - 554 | =head2 add_simple_required_check $matcher->add_simple_required_check($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer, $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer); Adds a required check, which requires that the normalized keys made from the source and targets must match for a match to be considered valid. =cut | |||||
555 | ||||||
556 | sub add_simple_required_check { | |||||
557 | 0 | my $self = shift; | ||||
558 | 0 | my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer, | ||||
559 | $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_; | |||||
560 | ||||||
561 | 0 | $self->add_required_check( | ||||
562 | [ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, 'length' => $source_length, | |||||
563 | norms => [ $source_normalizer ] } ], | |||||
564 | [ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, 'length' => $target_length, | |||||
565 | norms => [ $target_normalizer ] } ] | |||||
566 | ); | |||||
567 | } | |||||
568 | ||||||
569 - 593 | =head2 find_matches my @matches = $matcher->get_matches($marc_record, $max_matches); foreach $match (@matches) { # matches already sorted in order of # decreasing score print "record ID: $match->{'record_id'}; print "score: $match->{'score'}; } Identifies all of the records matching the given MARC record. For a record already in the database to be considered a match, it must meet the following criteria: =over 2 =item 1. Total score from its matching field must exceed the supplied threshold. =item 2. It must pass all required checks. =back Only the top $max_matches matches are returned. The returned array is sorted in order of decreasing score, i.e., the best match is first. =cut | |||||
594 | ||||||
595 | sub get_matches { | |||||
596 | 0 | my $self = shift; | ||||
597 | 0 | my ($source_record, $max_matches) = @_; | ||||
598 | ||||||
599 | 0 | my %matches = (); | ||||
600 | ||||||
601 | 0 0 | foreach my $matchpoint (@{ $self->{'matchpoints'} }) { | ||||
602 | 0 | my @source_keys = _get_match_keys($source_record, $matchpoint); | ||||
603 | 0 | next if scalar(@source_keys) == 0; | ||||
604 | # build query | |||||
605 | 0 0 | my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys); | ||||
606 | # FIXME only searching biblio index at the moment | |||||
607 | 0 | my ($error, $searchresults, $total_hits) = SimpleSearch($query, 0, $max_matches); | ||||
608 | ||||||
609 | 0 | if (defined $error ) { | ||||
610 | 0 | warn "search failed ($query) $error"; | ||||
611 | } else { | |||||
612 | 0 0 | foreach my $matched (@{$searchresults}) { | ||||
613 | 0 | $matches{$matched} += $matchpoint->{'score'}; | ||||
614 | } | |||||
615 | } | |||||
616 | } | |||||
617 | ||||||
618 | # get rid of any that don't meet the threshold | |||||
619 | 0 0 | %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches; | ||||
620 | ||||||
621 | # get rid of any that don't meet the required checks | |||||
622 | 0 0 | %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ? ($_ => $matches{$_}) : () } | ||||
623 | keys %matches; | |||||
624 | ||||||
625 | 0 | my @results = (); | ||||
626 | 0 | foreach my $marcblob (keys %matches) { | ||||
627 | 0 | my $target_record = MARC::Record->new_from_usmarc($marcblob); | ||||
628 | 0 | my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, ''); | ||||
629 | # FIXME - again, bibliospecific | |||||
630 | # also, can search engine be induced to give just the number in the first place? | |||||
631 | 0 | my $record_number = $result->{'biblionumber'}; | ||||
632 | 0 | push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} }; | ||||
633 | } | |||||
634 | 0 0 | @results = sort { $b->{'score'} cmp $a->{'score'} } @results; | ||||
635 | 0 | if (scalar(@results) > $max_matches) { | ||||
636 | 0 | @results = @results[0..$max_matches-1]; | ||||
637 | } | |||||
638 | 0 | return @results; | ||||
639 | ||||||
640 | } | |||||
641 | ||||||
642 - 650 | =head2 dump $description = $matcher->dump(); Returns a reference to a structure containing all of the information in the matcher object. This is mainly a convenience method to aid setting up a HTML editing form. =cut | |||||
651 | ||||||
652 | sub dump { | |||||
653 | 0 | my $self = shift; | ||||
654 | ||||||
655 | 0 | my $result = {}; | ||||
656 | ||||||
657 | 0 | $result->{'matcher_id'} = $self->{'id'}; | ||||
658 | 0 | $result->{'code'} = $self->{'code'}; | ||||
659 | 0 | $result->{'description'} = $self->{'description'}; | ||||
660 | ||||||
661 | 0 | $result->{'matchpoints'} = []; | ||||
662 | 0 0 | foreach my $matchpoint (@{ $self->{'matchpoints'} }) { | ||||
663 | 0 0 | push @{ $result->{'matchpoints'} }, $matchpoint; | ||||
664 | } | |||||
665 | 0 | $result->{'matchchecks'} = []; | ||||
666 | 0 0 | foreach my $matchcheck (@{ $self->{'required_checks'} }) { | ||||
667 | 0 0 | push @{ $result->{'matchchecks'} }, $matchcheck; | ||||
668 | } | |||||
669 | ||||||
670 | 0 | return $result; | ||||
671 | } | |||||
672 | ||||||
673 | sub _passes_required_checks { | |||||
674 | 0 | my ($source_record, $target_blob, $matchchecks) = @_; | ||||
675 | 0 | my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice | ||||
676 | ||||||
677 | # no checks supplied == automatic pass | |||||
678 | 0 0 | return 1 if $#{ $matchchecks } == -1; | ||||
679 | ||||||
680 | 0 0 | foreach my $matchcheck (@{ $matchchecks }) { | ||||
681 | 0 | my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'}); | ||||
682 | 0 | my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'}); | ||||
683 | 0 | return 0 unless $source_key eq $target_key; | ||||
684 | } | |||||
685 | 0 | return 1; | ||||
686 | } | |||||
687 | ||||||
688 | sub _get_match_keys { | |||||
689 | 0 | my $source_record = shift; | ||||
690 | 0 | my $matchpoint = shift; | ||||
691 | 0 | my $check_only_first_repeat = @_ ? shift : 0; | ||||
692 | ||||||
693 | # If there is more than one component to the matchpoint (e.g., | |||||
694 | # matchpoint includes both 003 and 001), any repeats | |||||
695 | # of the first component's tag are identified; repeats | |||||
696 | # of the subsequent components' tags are appended to | |||||
697 | # each parallel key dervied from the first component, | |||||
698 | # up to the number of repeats of the first component's tag. | |||||
699 | # | |||||
700 | # For example, if the record has one 003 and two 001s, only | |||||
701 | # one key is retrieved because there is only one 003. The key | |||||
702 | # will consist of the contents of the first 003 and first 001. | |||||
703 | # | |||||
704 | # If there are two 003s and two 001s, there will be two keys: | |||||
705 | # first 003 + first 001 | |||||
706 | # second 003 + second 001 | |||||
707 | ||||||
708 | 0 | my @keys = (); | ||||
709 | 0 | for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) { | ||||
710 | 0 | my $component = $matchpoint->{'components'}->[$i]; | ||||
711 | 0 | my $j = -1; | ||||
712 | 0 | FIELD: foreach my $field ($source_record->field($component->{'tag'})) { | ||||
713 | 0 | $j++; | ||||
714 | 0 | last FIELD if $j > 0 and $check_only_first_repeat; | ||||
715 | 0 | last FIELD if $i > 0 and $j > $#keys; | ||||
716 | 0 | my $key = ""; | ||||
717 | 0 | my $string; | ||||
718 | 0 | if ($field->is_control_field()) { | ||||
719 | 0 | $string=$field->data(); | ||||
720 | } else { | |||||
721 | 0 | foreach my $subfield ($field->subfields()) { | ||||
722 | 0 | if (exists $component->{'subfields'}->{$subfield->[0]}) { | ||||
723 | 0 | $string .= " " . $subfield->[1]; | ||||
724 | } | |||||
725 | } | |||||
726 | } | |||||
727 | 0 | if ($component->{'length'}>0) { | ||||
728 | 0 | $string= substr($string, $component->{'offset'}, $component->{'length'}); | ||||
729 | # FIXME normalize, substr | |||||
730 | } elsif ($component->{'offset'}) { | |||||
731 | 0 | $string= substr($string, $component->{'offset'}); | ||||
732 | } | |||||
733 | 0 | $key = _normalize($string); | ||||
734 | 0 | if ($i == 0) { | ||||
735 | 0 | push @keys, $key if $key; | ||||
736 | } else { | |||||
737 | 0 | $keys[$j] .= " $key" if $key; | ||||
738 | } | |||||
739 | } | |||||
740 | 0 | } | ||||
741 | 0 | return @keys; | ||||
742 | } | |||||
743 | ||||||
744 | ||||||
745 | sub _parse_match_component { | |||||
746 | 0 | my $input_component = shift; | ||||
747 | ||||||
748 | 0 | my $component = {}; | ||||
749 | 0 | $component->{'tag'} = $input_component->{'tag'}; | ||||
750 | 0 0 | $component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) }; | ||||
751 | 0 | $component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1; | ||||
752 | 0 | $component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0; | ||||
753 | 0 | $component->{'norms'} = $input_component->{'norms'} ? $input_component->{'norms'} : []; | ||||
754 | ||||||
755 | 0 | return $component; | ||||
756 | } | |||||
757 | ||||||
758 | # FIXME - default normalizer | |||||
759 | sub _normalize { | |||||
760 | 0 | my $value = uc shift; | ||||
761 | 0 | $value =~ s/[.;:,\]\[\)\(\/'"]//g; | ||||
762 | 0 | $value =~ s/^\s+//; | ||||
763 | #$value =~ s/^\s+$//; | |||||
764 | 0 | $value =~ s/\s+$//; | ||||
765 | 0 | $value =~ s/\s+/ /g; | ||||
766 | #$value =~ s/[.;,\]\[\)\(\/"']//g; | |||||
767 | 0 | return $value; | ||||
768 | } | |||||
769 | ||||||
770 | 1; |