File Coverage

File:	C4/Charset.pm
Coverage:	9.3%

line	stmt	bran	cond	sub	time	code
1						package C4::Charset;
2
3						# Copyright (C) 2008 LibLime
4						#
5						# This file is part of Koha.
6						#
7						# Koha is free software; you can redistribute it and/or modify it under the
8						# terms of the GNU General Public License as published by the Free Software
9						# Foundation; either version 2 of the License, or (at your option) any later
10						# version.
11						#
12						# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13						# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14						# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15						#
16						# You should have received a copy of the GNU General Public License along
17						# with Koha; if not, write to the Free Software Foundation, Inc.,
18						# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
20	20 20 20			20	486 72 690	use strict;
21	20 20 20			20	146 77 874	use warnings;
22
23	20 20 20			20	350 40528 1239	use MARC::Charset qw/marc8_to_utf8/;
24	20 20 20			20	29613 38121 840	use Text::Iconv;
25	20 20 20			20	421 51 1572	use C4::Debug;
26	20 20 20			20	97 30 1345	use Unicode::Normalize;
27
28	20 20 20			20	91 29 2893	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
29
30						BEGIN {
31						# set the version for version checking
32	20			20	96	$VERSION = 3.01;
33	20				113	require Exporter;
34	20				217	@ISA = qw(Exporter);
35	20				106544	@EXPORT = qw(
36						NormalizeString
37						IsStringUTF8ish
38						MarcToUTF8Record
39						SetUTF8Flag
40						SetMarcUnicodeFlag
41						StripNonXmlChars
42						nsb_clean
43						);
44						}
45
46 - 75						=head1 NAME C4::Charset - utilities for handling character set conversions. =head1 SYNOPSIS use C4::Charset; =head1 DESCRIPTION This module contains routines for dealing with character set conversions, particularly for MARC records. A variety of character encodings are in use by various MARC standards, and even more character encodings are used by non-standard MARC records. The various MARC formats generally do not do a good job of advertising a given record's character encoding, and even when a record does advertise its encoding, e.g., via the Leader/09, experience has shown that one cannot trust it. Ultimately, all MARC records are stored in Koha in UTF-8 and must be converted from whatever the source character encoding is. The goal of this module is to ensure that these conversions take place accurately. When a character conversion cannot take place, or at least not accurately, the module was provide enough information to allow user-facing code to inform the user on how to deal with the situation. =cut
76
77 - 105						=head1 FUNCTIONS =head2 IsStringUTF8ish my $is_utf8 = IsStringUTF8ish($str); Determines if C<$str> is valid UTF-8. This can mean one of two things: =over =item * The Perl UTF-8 flag is set and the string contains valid UTF-8. =item * The Perl UTF-8 flag is B<not> set, but the octets contain valid UTF-8. =back The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8> because in one could be presented with a MARC blob that is not actually in UTF-8 but whose sequence of octets appears to be valid UTF-8. The rest of the MARC character conversion functions will assume that this situation occur does not very often. =cut
106
107						sub IsStringUTF8ish {
108	3			3	44	my $str = shift;
109
110	3	50			49	return 1 if utf8::is_utf8($str);
111	3				63	return utf8::decode($str);
112						}
113
114 - 131						=head2 SetUTF8Flag my $marc_record = SetUTF8Flag($marc_record, $nfd); This function sets the PERL UTF8 flag for data. It is required when using new_from_usmarc since MARC::File::USMARC does not handle PERL UTF8 setting. When editing unicode marc records fields and subfields, you would end up in double encoding without using this function. If $nfd is set, string normalization will use NFD instead of NFC FIXME In my opinion, this function belongs to MARC::Record and not to this package. But since it handles charset, and MARC::Record, it finds its way in that package =cut
132
133						sub SetUTF8Flag{
134	0			0		my ($record, $nfd)=@_;
135	0	0	0			return unless ($record && $record->fields());
136	0					foreach my $field ($record->fields()){
137	0	0				if ($field->tag()>=10){
138	0					my @subfields;
139	0					foreach my $subfield ($field->subfields()){
140	0					push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
141						}
142	0					my $newfield=MARC::Field->new(
143						$field->tag(),
144						$field->indicator(1),
145						$field->indicator(2),
146						@subfields
147						);
148	0					$field->replace_with($newfield);
149						}
150						}
151						}
152
153 - 170						=head2 NormalizeString my $normalized_string=NormalizeString($string,$nfd,$transform); Given a string nfd : If you want to set NFD and not NFC transform : If you expect all the signs to be removed Sets the PERL UTF8 Flag on your initial data if need be and applies cleaning if required Returns a utf8 NFC normalized string Sample code : my $string=NormalizeString ("l'ornithoptère"); #results into ornithoptère in NFC form and sets UTF8 Flag =cut
171
172
173						sub NormalizeString{
174	0			0		my ($string,$nfd,$transform)=@_;
175	0	0				utf8::decode($string) unless (utf8::is_utf8($string));
176	0	0				if ($nfd){
177	0					$string= NFD($string);
178						}
179						else {
180	0					$string=NFC($string);
181						}
182	0	0				if ($transform){
183	0					$string=~s/\<\|\>\|\^\|\;\|\.\|\?\|,\|\-\|$\|$\|\[\|\]\|\{\|\}\|\$\|\%\|\!\|\*\|\:\|\\\|\/\|\&\|\"\|\'/ /g;
184						#removing one letter words "d'" "l'" was changed into "d " "l "
185	0					$string=~s/\b\S\b//g;
186	0					$string=~s/\s+$//g;
187						}
188	0					return $string;
189						}
190
191 - 210						=head2 MarcToUTF8Record ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]); Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an optional source encoding, return a C<MARC::Record> that is converted to UTF-8. The returned C<$marc_record> is guaranteed to be in valid UTF-8, but is not guaranteed to have been converted correctly. Specifically, if C<$converted_from> is 'failed', the MARC record returned failed character conversion and had each of its non-ASCII octets changed to the Unicode replacement character. If the source encoding was not specified, this routine will try to guess it; the character encoding used for a successful conversion is returned in C<$converted_from>. =cut
211
212						sub MarcToUTF8Record {
213	0			0		my $marc = shift;
214	0					my $marc_flavour = shift;
215	0					my $source_encoding = shift;
216	0					my $marc_record;
217	0					my $marc_blob_is_utf8 = 0;
218	0	0				if (ref($marc) eq 'MARC::Record') {
219	0					my $marc_blob = $marc->as_usmarc();
220	0					$marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
221	0					$marc_record = $marc;
222						} else {
223						# dealing with a MARC blob
224
225						# remove any ersatz whitespace from the beginning and
226						# end of the MARC blob -- these can creep into MARC
227						# files produced by several sources -- caller really
228						# should be doing this, however
229	0					$marc =~ s/^\s+//;
230	0					$marc =~ s/\s+$//;
231	0					$marc_blob_is_utf8 = IsStringUTF8ish($marc);
232	0					eval {
233	0					$marc_record = MARC::Record->new_from_usmarc($marc);
234						};
235	0	0				if ($@) {
236						# if we fail the first time, one likely problem
237						# is that we have a MARC21 record that says that it's
238						# UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
239						# We'll try parsing it again.
240	0					substr($marc, 9, 1) = ' ';
241	0					eval {
242	0					$marc_record = MARC::Record->new_from_usmarc($marc);
243						};
244	0	0				if ($@) {
245						# it's hopeless; return an empty MARC::Record
246	0					return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
247						}
248						}
249						}
250
251						# If we do not know the source encoding, try some guesses
252						# as follows:
253						# 1. Record is UTF-8 already.
254						# 2. If MARC flavor is MARC21 or NORMARC, then
255						# a. record is MARC-8
256						# b. record is ISO-8859-1
257						# 3. If MARC flavor is UNIMARC, then
258	0	0				if (not defined $source_encoding) {
259	0	0				if ($marc_blob_is_utf8) {
260						# note that for MARC21/NORMARC we are not bothering to check
261						# if the Leader/09 is set to 'a' or not -- because
262						# of problems with various ILSs (including Koha in the
263						# past, alas), this just is not trustworthy.
264	0					SetMarcUnicodeFlag($marc_record, $marc_flavour);
265	0					return $marc_record, 'UTF-8', [];
266						} else {
267	0	0 0	0			if ($marc_flavour eq 'MARC21' \|\| $marc_flavour eq 'NORMARC') {
268	0					return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
269						} elsif ($marc_flavour =~/UNIMARC/) {
270	0					return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
271						} else {
272	0					return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
273						}
274						}
275						} else {
276						# caller knows the character encoding
277	0					my $original_marc_record = $marc_record->clone();
278	0					my @errors;
279	0	0 0 0				if ($source_encoding =~ /utf-?8/i) {
280	0	0				if ($marc_blob_is_utf8) {
281	0					SetMarcUnicodeFlag($marc_record, $marc_flavour);
282	0					return $marc_record, 'UTF-8', [];
283						} else {
284	0					push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
285						}
286						} elsif ($source_encoding =~ /marc-?8/i) {
287	0					@errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
288						} elsif ($source_encoding =~ /5426/) {
289	0					@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
290						} else {
291						# assume any other character encoding is for Text::Iconv
292	0					@errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
293						}
294
295	0	0				if (@errors) {
296	0					_marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
297	0					return $original_marc_record, 'failed', \@errors;
298						} else {
299	0					return $marc_record, $source_encoding, [];
300						}
301						}
302
303						}
304
305 - 315						=head2 SetMarcUnicodeFlag SetMarcUnicodeFlag($marc_record, $marc_flavour); Set both the internal MARC::Record encoding flag and the appropriate Leader/09 (MARC21) or 100/26-29 (UNIMARC) to indicate that the record is in UTF-8. Note that this does B<not> do any actual character conversion. =cut
316
317						sub SetMarcUnicodeFlag {
318	0			0		my $marc_record = shift;
319	0					my $marc_flavour = shift; # \|\| C4::Context->preference("marcflavour");
320
321	0					$marc_record->encoding('UTF-8');
322	0	0 0	0			if ($marc_flavour eq 'MARC21' \|\| $marc_flavour eq 'NORMARC') {
323	0					my $leader = $marc_record->leader();
324	0					substr($leader, 9, 1) = 'a';
325	0					$marc_record->leader($leader);
326						} elsif ($marc_flavour =~/UNIMARC/) {
327	0					my $string;
328	0	0				my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
329	0					$string=$marc_record->subfield( 100, "a" );
330	0	0	0			if (defined $string && length($string)==$subflength) {
331	0	0				$string = substr $string, 0,$subflength if (length($string)>$subflength);
332						}
333						else {
334	0					$string = POSIX::strftime( "%Y%m%d", localtime );
335	0					$string =~ s/\-//g;
336	0					$string = sprintf( "%-*s", $subflength, $string );
337						}
338	0					substr( $string, $encodingposition, 8, "frey50 " );
339	0	0				if ( $marc_record->subfield( 100, "a" ) ) {
340	0					$marc_record->field('100')->update(a=>$string);
341						}
342						else {
343	0					$marc_record->insert_grouped_field(
344						MARC::Field->new( 100, '', '', "a" => $string ) );
345						}
346	0	0				$debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
347						} else {
348	0					warn "Unrecognized marcflavour: $marc_flavour";
349						}
350						}
351
352 - 375						=head2 StripNonXmlChars my $new_str = StripNonXmlChars($old_str); Given a string, return a copy with the characters that are illegal in XML removed. This function exists to work around a problem that can occur with badly-encoded MARC records. Specifically, if a UTF-8 MARC record also has excape (\x1b) characters, MARC::File::XML will let the escape characters pass through when as_xml() or as_xml_record() is called. The problem is that the escape character is not legal in well-formed XML documents, so when MARC::File::XML attempts to parse such a record, the XML parser will fail. Stripping such characters will allow a MARC::Record->new_from_xml() to work, at the possible risk of some data loss. =cut
376
377						sub StripNonXmlChars {
378	0			0		my $str = shift;
379	0	0	0			if (!defined($str) \|\| $str eq ""){
380	0					return "";
381						}
382	0					$str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
383	0					return $str;
384						}
385
386
387
388 - 398						=head2 nsb_clean =over 4 nsb_clean($string); =back Removes Non Sorting Block characters =cut
399						sub nsb_clean {
400	0			0		my $NSB = '\x88' ; # NSB : begin Non Sorting Block
401	0					my $NSE = '\x89' ; # NSE : Non Sorting Block end
402	0					my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
403	0					my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
404	0					my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
405
406						# handles non sorting blocks
407	0					my ($string) = @_ ;
408	0					$_ = $string ;
409	0					s/$NSB//g ;
410	0					s/$NSE//g ;
411	0					s/$NSB2//g ;
412	0					s/$NSE2//g ;
413	0					s/$C2//g ;
414	0					$string = $_ ;
415
416	0					return($string) ;
417						}
418
419
420 - 436						=head1 INTERNAL FUNCTIONS =head2 _default_marc21_charconv_to_utf8 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record); Converts a C<MARC::Record> of unknown character set to UTF-8, first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1 to UTF-8, then a default conversion that replaces each non-ASCII character with the replacement character. The C<$guessed_charset> return value contains the character set that resulted in a conversion to valid UTF-8; note that if the MARC-8 and ISO-8859-1 conversions failed, the value of this is 'failed'. =cut
437
438						sub _default_marc21_charconv_to_utf8 {
439	0			0		my $marc_record = shift;
440	0					my $marc_flavour = shift;
441
442	0					my $trial_marc8 = $marc_record->clone();
443	0					my @all_errors = ();
444	0					my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
445	0	0				unless (@errors) {
446	0					return $trial_marc8, 'MARC-8', [];
447						}
448	0					push @all_errors, @errors;
449
450	0					my $trial_8859_1 = $marc_record->clone();
451	0					@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
452	0	0				unless (@errors) {
453	0					return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
454						# instead if we wanted to report details
455						# of the failed attempt at MARC-8 => UTF-8
456						}
457	0					push @all_errors, @errors;
458
459	0					my $default_converted = $marc_record->clone();
460	0					_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
461	0					return $default_converted, 'failed', \@all_errors;
462						}
463
464 - 478						=head2 _default_unimarc_charconv_to_utf8 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record); Converts a C<MARC::Record> of unknown character set to UTF-8, first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1 to UTF-8, then a default conversion that replaces each non-ASCII character with the replacement character. The C<$guessed_charset> return value contains the character set that resulted in a conversion to valid UTF-8; note that if the MARC-8 and ISO-8859-1 conversions failed, the value of this is 'failed'. =cut
479
480						sub _default_unimarc_charconv_to_utf8 {
481	0			0		my $marc_record = shift;
482	0					my $marc_flavour = shift;
483
484	0					my $trial_marc8 = $marc_record->clone();
485	0					my @all_errors = ();
486	0					my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
487	0	0				unless (@errors) {
488	0					return $trial_marc8, 'iso-5426';
489						}
490	0					push @all_errors, @errors;
491
492	0					my $trial_8859_1 = $marc_record->clone();
493	0					@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
494	0	0				unless (@errors) {
495	0					return $trial_8859_1, 'iso-8859-1';
496						}
497	0					push @all_errors, @errors;
498
499	0					my $default_converted = $marc_record->clone();
500	0					_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
501	0					return $default_converted, 'failed', \@all_errors;
502						}
503
504 - 513						=head2 _marc_marc8_to_utf8 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding); Convert a C<MARC::Record> to UTF-8 in-place from MARC-8. If the conversion fails for some reason, an appropriate messages will be placed in the returned C<@errors> array. =cut
514
515						sub _marc_marc8_to_utf8 {
516	0			0		my $marc_record = shift;
517	0					my $marc_flavour = shift;
518
519	0					my $prev_ignore = MARC::Charset->ignore_errors();
520	0					MARC::Charset->ignore_errors(1);
521
522						# trap warnings raised by MARC::Charset
523	0					my @errors = ();
524						local $SIG{__WARN__} = sub {
525	0			0		my $msg = $_[0];
526	0	0				if ($msg =~ /MARC.Charset/) {
527						# FIXME - purpose of this regexp is to strip out the
528						# line reference to MARC/Charset.pm, but as it
529						# exists probably won't work quite on Windows --
530						# some sort of minimal-bunch back-tracking RE
531						# would be helpful here
532	0					$msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
533	0					push @errors, $msg;
534						} else {
535						# if warning doesn't come from MARC::Charset, just
536						# pass it on
537	0					warn $msg;
538						}
539	0					};
540
541	0					foreach my $field ($marc_record->fields()) {
542	0	0				if ($field->is_control_field()) {
543						; # do nothing -- control fields should not contain non-ASCII characters
544						} else {
545	0					my @converted_subfields;
546	0					foreach my $subfield ($field->subfields()) {
547	0					my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
548	0	0				unless (IsStringUTF8ish($utf8sf)) {
549						# Because of a bug in MARC::Charset 0.98, if the string
550						# has (a) one or more diacritics that (b) are only in character positions
551						# 128 to 255 inclusive, the resulting converted string is not in
552						# UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
553						# occurs, upgrade the string in place. Moral of the story seems to be
554						# that pack("U", ...) is better than chr(...) if you need to guarantee
555						# that the resulting string is UTF-8.
556	0					utf8::upgrade($utf8sf);
557						}
558	0					push @converted_subfields, $subfield->[0], $utf8sf;
559						}
560
561	0					$field->replace_with(MARC::Field->new(
562						$field->tag(), $field->indicator(1), $field->indicator(2),
563						@converted_subfields)
564						);
565						}
566						}
567
568	0					MARC::Charset->ignore_errors($prev_ignore);
569
570	0					SetMarcUnicodeFlag($marc_record, $marc_flavour);
571
572	0					return @errors;
573						}
574
575 - 587						=head2 _marc_iso5426_to_utf8 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding); Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426. If the conversion fails for some reason, an appropriate messages will be placed in the returned C<@errors> array. FIXME - is ISO-5426 equivalent enough to MARC-8 that C<MARC::Charset> can be used instead? =cut
588
589						sub _marc_iso5426_to_utf8 {
590	0			0		my $marc_record = shift;
591	0					my $marc_flavour = shift;
592
593	0					my @errors = ();
594
595	0					foreach my $field ($marc_record->fields()) {
596	0	0				if ($field->is_control_field()) {
597						; # do nothing -- control fields should not contain non-ASCII characters
598						} else {
599	0					my @converted_subfields;
600	0					foreach my $subfield ($field->subfields()) {
601	0					my $utf8sf = char_decode5426($subfield->[1]);
602	0					push @converted_subfields, $subfield->[0], $utf8sf;
603						}
604
605	0					$field->replace_with(MARC::Field->new(
606						$field->tag(), $field->indicator(1), $field->indicator(2),
607						@converted_subfields)
608						);
609						}
610						}
611
612	0					SetMarcUnicodeFlag($marc_record, $marc_flavour);
613
614	0					return @errors;
615						}
616
617 - 629						=head2 _marc_to_utf8_via_text_iconv my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); Convert a C<MARC::Record> to UTF-8 in-place using the C<Text::Iconv> CPAN module. Any source encoding accepted by the user's iconv installation should work. If the source encoding is not recognized on the user's server or the conversion fails for some reason, appropriate messages will be placed in the returned C<@errors> array. =cut
630
631						sub _marc_to_utf8_via_text_iconv {
632	0			0		my $marc_record = shift;
633	0					my $marc_flavour = shift;
634	0					my $source_encoding = shift;
635
636	0					my @errors = ();
637	0					my $decoder;
638	0 0					eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
639	0	0				if ($@) {
640	0					push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
641	0					return @errors;
642						}
643
644	0					my $prev_raise_error = Text::Iconv->raise_error();
645	0					Text::Iconv->raise_error(1);
646
647	0					foreach my $field ($marc_record->fields()) {
648	0	0				if ($field->is_control_field()) {
649						; # do nothing -- control fields should not contain non-ASCII characters
650						} else {
651	0					my @converted_subfields;
652	0					foreach my $subfield ($field->subfields()) {
653	0					my $converted_value;
654	0					my $conversion_ok = 1;
655	0 0					eval { $converted_value = $decoder->convert($subfield->[1]); };
656	0	0 0				if ($@) {
657	0					$conversion_ok = 0;
658	0					push @errors, $@;
659						} elsif (not defined $converted_value) {
660	0					$conversion_ok = 0;
661	0					push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
662						}
663
664	0	0				if ($conversion_ok) {
665	0					push @converted_subfields, $subfield->[0], $converted_value;
666						} else {
667	0					$converted_value = $subfield->[1];
668	0					$converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
669	0					push @converted_subfields, $subfield->[0], $converted_value;
670						}
671						}
672
673	0					$field->replace_with(MARC::Field->new(
674						$field->tag(), $field->indicator(1), $field->indicator(2),
675						@converted_subfields)
676						);
677						}
678						}
679
680	0					SetMarcUnicodeFlag($marc_record, $marc_flavour);
681	0					Text::Iconv->raise_error($prev_raise_error);
682
683	0					return @errors;
684						}
685
686 - 698						=head2 _marc_to_utf8_replacement_char _marc_to_utf8_replacement_char($marc_record, $marc_flavour); Convert a C<MARC::Record> to UTF-8 in-place, adopting the unsatisfactory method of replacing all non-ASCII (e.g., where the eight bit is set) octet with the Unicode replacement character. This is meant as a last-ditch method, and would be best used as part of a UI that lets a cataloguer pick various character conversions until he or she finds the right one. =cut
699
700						sub _marc_to_utf8_replacement_char {
701	0			0		my $marc_record = shift;
702	0					my $marc_flavour = shift;
703
704	0					foreach my $field ($marc_record->fields()) {
705	0	0				if ($field->is_control_field()) {
706						; # do nothing -- control fields should not contain non-ASCII characters
707						} else {
708	0					my @converted_subfields;
709	0					foreach my $subfield ($field->subfields()) {
710	0					my $value = $subfield->[1];
711	0					$value =~ s/[\200-\377]/\xef\xbf\xbd/g;
712	0					push @converted_subfields, $subfield->[0], $value;
713						}
714
715	0					$field->replace_with(MARC::Field->new(
716						$field->tag(), $field->indicator(1), $field->indicator(2),
717						@converted_subfields)
718						);
719						}
720						}
721
722	0					SetMarcUnicodeFlag($marc_record, $marc_flavour);
723						}
724
725 - 731						=head2 char_decode5426 my $utf8string = char_decode5426($iso_5426_string); Converts a string from ISO-5426 to UTF-8. =cut
732
733
734						my %chars;
735						$chars{0xb0}=0x0101;#3/0ayn[ain]
736						$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
737						#$chars{0xb2}=0x00e0;#'à';
738						$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
739						#$chars{0xb3}=0x00e7;#'ç';
740						$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
741						# $chars{0xb4}='è';
742						$chars{0xb4}=0x00e8;
743						$chars{0xbd}=0x02b9;
744						$chars{0xbe}=0x02ba;
745						# $chars{0xb5}='é';
746						$chars{0xb5}=0x00e9;
747						$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
748						$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
749						$chars{0xfa}=0x0153; #oe
750						$chars{0xea}=0x0152; #oe
751						$chars{0x81d1}=0x00b0;
752
753						####
754						## combined characters iso5426
755
756						$chars{0xc041}=0x1ea2; # capital a with hook above
757						$chars{0xc045}=0x1eba; # capital e with hook above
758						$chars{0xc049}=0x1ec8; # capital i with hook above
759						$chars{0xc04f}=0x1ece; # capital o with hook above
760						$chars{0xc055}=0x1ee6; # capital u with hook above
761						$chars{0xc059}=0x1ef6; # capital y with hook above
762						$chars{0xc061}=0x1ea3; # small a with hook above
763						$chars{0xc065}=0x1ebb; # small e with hook above
764						$chars{0xc069}=0x1ec9; # small i with hook above
765						$chars{0xc06f}=0x1ecf; # small o with hook above
766						$chars{0xc075}=0x1ee7; # small u with hook above
767						$chars{0xc079}=0x1ef7; # small y with hook above
768
769						# 4/1 grave accent
770						$chars{0xc141}=0x00c0; # capital a with grave accent
771						$chars{0xc145}=0x00c8; # capital e with grave accent
772						$chars{0xc149}=0x00cc; # capital i with grave accent
773						$chars{0xc14f}=0x00d2; # capital o with grave accent
774						$chars{0xc155}=0x00d9; # capital u with grave accent
775						$chars{0xc157}=0x1e80; # capital w with grave
776						$chars{0xc159}=0x1ef2; # capital y with grave
777						$chars{0xc161}=0x00e0; # small a with grave accent
778						$chars{0xc165}=0x00e8; # small e with grave accent
779						$chars{0xc169}=0x00ec; # small i with grave accent
780						$chars{0xc16f}=0x00f2; # small o with grave accent
781						$chars{0xc175}=0x00f9; # small u with grave accent
782						$chars{0xc177}=0x1e81; # small w with grave
783						$chars{0xc179}=0x1ef3; # small y with grave
784						# 4/2 acute accent
785						$chars{0xc241}=0x00c1; # capital a with acute accent
786						$chars{0xc243}=0x0106; # capital c with acute accent
787						$chars{0xc245}=0x00c9; # capital e with acute accent
788						$chars{0xc247}=0x01f4; # capital g with acute
789						$chars{0xc249}=0x00cd; # capital i with acute accent
790						$chars{0xc24b}=0x1e30; # capital k with acute
791						$chars{0xc24c}=0x0139; # capital l with acute accent
792						$chars{0xc24d}=0x1e3e; # capital m with acute
793						$chars{0xc24e}=0x0143; # capital n with acute accent
794						$chars{0xc24f}=0x00d3; # capital o with acute accent
795						$chars{0xc250}=0x1e54; # capital p with acute
796						$chars{0xc252}=0x0154; # capital r with acute accent
797						$chars{0xc253}=0x015a; # capital s with acute accent
798						$chars{0xc255}=0x00da; # capital u with acute accent
799						$chars{0xc257}=0x1e82; # capital w with acute
800						$chars{0xc259}=0x00dd; # capital y with acute accent
801						$chars{0xc25a}=0x0179; # capital z with acute accent
802						$chars{0xc261}=0x00e1; # small a with acute accent
803						$chars{0xc263}=0x0107; # small c with acute accent
804						$chars{0xc265}=0x00e9; # small e with acute accent
805						$chars{0xc267}=0x01f5; # small g with acute
806						$chars{0xc269}=0x00ed; # small i with acute accent
807						$chars{0xc26b}=0x1e31; # small k with acute
808						$chars{0xc26c}=0x013a; # small l with acute accent
809						$chars{0xc26d}=0x1e3f; # small m with acute
810						$chars{0xc26e}=0x0144; # small n with acute accent
811						$chars{0xc26f}=0x00f3; # small o with acute accent
812						$chars{0xc270}=0x1e55; # small p with acute
813						$chars{0xc272}=0x0155; # small r with acute accent
814						$chars{0xc273}=0x015b; # small s with acute accent
815						$chars{0xc275}=0x00fa; # small u with acute accent
816						$chars{0xc277}=0x1e83; # small w with acute
817						$chars{0xc279}=0x00fd; # small y with acute accent
818						$chars{0xc27a}=0x017a; # small z with acute accent
819						$chars{0xc2e1}=0x01fc; # capital ae with acute
820						$chars{0xc2f1}=0x01fd; # small ae with acute
821						# 4/3 circumflex accent
822						$chars{0xc341}=0x00c2; # capital a with circumflex accent
823						$chars{0xc343}=0x0108; # capital c with circumflex
824						$chars{0xc345}=0x00ca; # capital e with circumflex accent
825						$chars{0xc347}=0x011c; # capital g with circumflex
826						$chars{0xc348}=0x0124; # capital h with circumflex
827						$chars{0xc349}=0x00ce; # capital i with circumflex accent
828						$chars{0xc34a}=0x0134; # capital j with circumflex
829						$chars{0xc34f}=0x00d4; # capital o with circumflex accent
830						$chars{0xc353}=0x015c; # capital s with circumflex
831						$chars{0xc355}=0x00db; # capital u with circumflex
832						$chars{0xc357}=0x0174; # capital w with circumflex
833						$chars{0xc359}=0x0176; # capital y with circumflex
834						$chars{0xc35a}=0x1e90; # capital z with circumflex
835						$chars{0xc361}=0x00e2; # small a with circumflex accent
836						$chars{0xc363}=0x0109; # small c with circumflex
837						$chars{0xc365}=0x00ea; # small e with circumflex accent
838						$chars{0xc367}=0x011d; # small g with circumflex
839						$chars{0xc368}=0x0125; # small h with circumflex
840						$chars{0xc369}=0x00ee; # small i with circumflex accent
841						$chars{0xc36a}=0x0135; # small j with circumflex
842						$chars{0xc36e}=0x00f1; # small n with tilde
843						$chars{0xc36f}=0x00f4; # small o with circumflex accent
844						$chars{0xc373}=0x015d; # small s with circumflex
845						$chars{0xc375}=0x00fb; # small u with circumflex
846						$chars{0xc377}=0x0175; # small w with circumflex
847						$chars{0xc379}=0x0177; # small y with circumflex
848						$chars{0xc37a}=0x1e91; # small z with circumflex
849						# 4/4 tilde
850						$chars{0xc441}=0x00c3; # capital a with tilde
851						$chars{0xc445}=0x1ebc; # capital e with tilde
852						$chars{0xc449}=0x0128; # capital i with tilde
853						$chars{0xc44e}=0x00d1; # capital n with tilde
854						$chars{0xc44f}=0x00d5; # capital o with tilde
855						$chars{0xc455}=0x0168; # capital u with tilde
856						$chars{0xc456}=0x1e7c; # capital v with tilde
857						$chars{0xc459}=0x1ef8; # capital y with tilde
858						$chars{0xc461}=0x00e3; # small a with tilde
859						$chars{0xc465}=0x1ebd; # small e with tilde
860						$chars{0xc469}=0x0129; # small i with tilde
861						$chars{0xc46e}=0x00f1; # small n with tilde
862						$chars{0xc46f}=0x00f5; # small o with tilde
863						$chars{0xc475}=0x0169; # small u with tilde
864						$chars{0xc476}=0x1e7d; # small v with tilde
865						$chars{0xc479}=0x1ef9; # small y with tilde
866						# 4/5 macron
867						$chars{0xc541}=0x0100; # capital a with macron
868						$chars{0xc545}=0x0112; # capital e with macron
869						$chars{0xc547}=0x1e20; # capital g with macron
870						$chars{0xc549}=0x012a; # capital i with macron
871						$chars{0xc54f}=0x014c; # capital o with macron
872						$chars{0xc555}=0x016a; # capital u with macron
873						$chars{0xc561}=0x0101; # small a with macron
874						$chars{0xc565}=0x0113; # small e with macron
875						$chars{0xc567}=0x1e21; # small g with macron
876						$chars{0xc569}=0x012b; # small i with macron
877						$chars{0xc56f}=0x014d; # small o with macron
878						$chars{0xc575}=0x016b; # small u with macron
879						$chars{0xc572}=0x0159; # small r with macron
880						$chars{0xc5e1}=0x01e2; # capital ae with macron
881						$chars{0xc5f1}=0x01e3; # small ae with macron
882						# 4/6 breve
883						$chars{0xc641}=0x0102; # capital a with breve
884						$chars{0xc645}=0x0114; # capital e with breve
885						$chars{0xc647}=0x011e; # capital g with breve
886						$chars{0xc649}=0x012c; # capital i with breve
887						$chars{0xc64f}=0x014e; # capital o with breve
888						$chars{0xc655}=0x016c; # capital u with breve
889						$chars{0xc661}=0x0103; # small a with breve
890						$chars{0xc665}=0x0115; # small e with breve
891						$chars{0xc667}=0x011f; # small g with breve
892						$chars{0xc669}=0x012d; # small i with breve
893						$chars{0xc66f}=0x014f; # small o with breve
894						$chars{0xc675}=0x016d; # small u with breve
895						# 4/7 dot above
896						$chars{0xc7b0}=0x01e1; # Ain with dot above
897						$chars{0xc742}=0x1e02; # capital b with dot above
898						$chars{0xc743}=0x010a; # capital c with dot above
899						$chars{0xc744}=0x1e0a; # capital d with dot above
900						$chars{0xc745}=0x0116; # capital e with dot above
901						$chars{0xc746}=0x1e1e; # capital f with dot above
902						$chars{0xc747}=0x0120; # capital g with dot above
903						$chars{0xc748}=0x1e22; # capital h with dot above
904						$chars{0xc749}=0x0130; # capital i with dot above
905						$chars{0xc74d}=0x1e40; # capital m with dot above
906						$chars{0xc74e}=0x1e44; # capital n with dot above
907						$chars{0xc750}=0x1e56; # capital p with dot above
908						$chars{0xc752}=0x1e58; # capital r with dot above
909						$chars{0xc753}=0x1e60; # capital s with dot above
910						$chars{0xc754}=0x1e6a; # capital t with dot above
911						$chars{0xc757}=0x1e86; # capital w with dot above
912						$chars{0xc758}=0x1e8a; # capital x with dot above
913						$chars{0xc759}=0x1e8e; # capital y with dot above
914						$chars{0xc75a}=0x017b; # capital z with dot above
915						$chars{0xc761}=0x0227; # small b with dot above
916						$chars{0xc762}=0x1e03; # small b with dot above
917						$chars{0xc763}=0x010b; # small c with dot above
918						$chars{0xc764}=0x1e0b; # small d with dot above
919						$chars{0xc765}=0x0117; # small e with dot above
920						$chars{0xc766}=0x1e1f; # small f with dot above
921						$chars{0xc767}=0x0121; # small g with dot above
922						$chars{0xc768}=0x1e23; # small h with dot above
923						$chars{0xc76d}=0x1e41; # small m with dot above
924						$chars{0xc76e}=0x1e45; # small n with dot above
925						$chars{0xc770}=0x1e57; # small p with dot above
926						$chars{0xc772}=0x1e59; # small r with dot above
927						$chars{0xc773}=0x1e61; # small s with dot above
928						$chars{0xc774}=0x1e6b; # small t with dot above
929						$chars{0xc777}=0x1e87; # small w with dot above
930						$chars{0xc778}=0x1e8b; # small x with dot above
931						$chars{0xc779}=0x1e8f; # small y with dot above
932						$chars{0xc77a}=0x017c; # small z with dot above
933						# 4/8 trema, diaresis
934						$chars{0xc820}=0x00a8; # diaeresis
935						$chars{0xc841}=0x00c4; # capital a with diaeresis
936						$chars{0xc845}=0x00cb; # capital e with diaeresis
937						$chars{0xc848}=0x1e26; # capital h with diaeresis
938						$chars{0xc849}=0x00cf; # capital i with diaeresis
939						$chars{0xc84f}=0x00d6; # capital o with diaeresis
940						$chars{0xc855}=0x00dc; # capital u with diaeresis
941						$chars{0xc857}=0x1e84; # capital w with diaeresis
942						$chars{0xc858}=0x1e8c; # capital x with diaeresis
943						$chars{0xc859}=0x0178; # capital y with diaeresis
944						$chars{0xc861}=0x00e4; # small a with diaeresis
945						$chars{0xc865}=0x00eb; # small e with diaeresis
946						$chars{0xc868}=0x1e27; # small h with diaeresis
947						$chars{0xc869}=0x00ef; # small i with diaeresis
948						$chars{0xc86f}=0x00f6; # small o with diaeresis
949						$chars{0xc874}=0x1e97; # small t with diaeresis
950						$chars{0xc875}=0x00fc; # small u with diaeresis
951						$chars{0xc877}=0x1e85; # small w with diaeresis
952						$chars{0xc878}=0x1e8d; # small x with diaeresis
953						$chars{0xc879}=0x00ff; # small y with diaeresis
954						# 4/9 umlaut
955						$chars{0xc920}=0x00a8; # [diaeresis]
956						$chars{0xc961}=0x00e4; # a with umlaut
957						$chars{0xc965}=0x00eb; # e with umlaut
958						$chars{0xc969}=0x00ef; # i with umlaut
959						$chars{0xc96f}=0x00f6; # o with umlaut
960						$chars{0xc975}=0x00fc; # u with umlaut
961						# 4/10 circle above
962						$chars{0xca41}=0x00c5; # capital a with ring above
963						$chars{0xcaad}=0x016e; # capital u with ring above
964						$chars{0xca61}=0x00e5; # small a with ring above
965						$chars{0xca75}=0x016f; # small u with ring above
966						$chars{0xca77}=0x1e98; # small w with ring above
967						$chars{0xca79}=0x1e99; # small y with ring above
968						# 4/11 high comma off centre
969						# 4/12 inverted high comma centred
970						# 4/13 double acute accent
971						$chars{0xcd4f}=0x0150; # capital o with double acute
972						$chars{0xcd55}=0x0170; # capital u with double acute
973						$chars{0xcd6f}=0x0151; # small o with double acute
974						$chars{0xcd75}=0x0171; # small u with double acute
975						# 4/14 horn
976						$chars{0xce54}=0x01a0; # latin capital letter o with horn
977						$chars{0xce55}=0x01af; # latin capital letter u with horn
978						$chars{0xce74}=0x01a1; # latin small letter o with horn
979						$chars{0xce75}=0x01b0; # latin small letter u with horn
980						# 4/15 caron (hacek
981						$chars{0xcf41}=0x01cd; # capital a with caron
982						$chars{0xcf43}=0x010c; # capital c with caron
983						$chars{0xcf44}=0x010e; # capital d with caron
984						$chars{0xcf45}=0x011a; # capital e with caron
985						$chars{0xcf47}=0x01e6; # capital g with caron
986						$chars{0xcf49}=0x01cf; # capital i with caron
987						$chars{0xcf4b}=0x01e8; # capital k with caron
988						$chars{0xcf4c}=0x013d; # capital l with caron
989						$chars{0xcf4e}=0x0147; # capital n with caron
990						$chars{0xcf4f}=0x01d1; # capital o with caron
991						$chars{0xcf52}=0x0158; # capital r with caron
992						$chars{0xcf53}=0x0160; # capital s with caron
993						$chars{0xcf54}=0x0164; # capital t with caron
994						$chars{0xcf55}=0x01d3; # capital u with caron
995						$chars{0xcf5a}=0x017d; # capital z with caron
996						$chars{0xcf61}=0x01ce; # small a with caron
997						$chars{0xcf63}=0x010d; # small c with caron
998						$chars{0xcf64}=0x010f; # small d with caron
999						$chars{0xcf65}=0x011b; # small e with caron
1000						$chars{0xcf67}=0x01e7; # small g with caron
1001						$chars{0xcf69}=0x01d0; # small i with caron
1002						$chars{0xcf6a}=0x01f0; # small j with caron
1003						$chars{0xcf6b}=0x01e9; # small k with caron
1004						$chars{0xcf6c}=0x013e; # small l with caron
1005						$chars{0xcf6e}=0x0148; # small n with caron
1006						$chars{0xcf6f}=0x01d2; # small o with caron
1007						$chars{0xcf72}=0x0159; # small r with caron
1008						$chars{0xcf73}=0x0161; # small s with caron
1009						$chars{0xcf74}=0x0165; # small t with caron
1010						$chars{0xcf75}=0x01d4; # small u with caron
1011						$chars{0xcf7a}=0x017e; # small z with caron
1012						# 5/0 cedilla
1013						$chars{0xd020}=0x00b8; # cedilla
1014						$chars{0xd043}=0x00c7; # capital c with cedilla
1015						$chars{0xd044}=0x1e10; # capital d with cedilla
1016						$chars{0xd047}=0x0122; # capital g with cedilla
1017						$chars{0xd048}=0x1e28; # capital h with cedilla
1018						$chars{0xd04b}=0x0136; # capital k with cedilla
1019						$chars{0xd04c}=0x013b; # capital l with cedilla
1020						$chars{0xd04e}=0x0145; # capital n with cedilla
1021						$chars{0xd052}=0x0156; # capital r with cedilla
1022						$chars{0xd053}=0x015e; # capital s with cedilla
1023						$chars{0xd054}=0x0162; # capital t with cedilla
1024						$chars{0xd063}=0x00e7; # small c with cedilla
1025						$chars{0xd064}=0x1e11; # small d with cedilla
1026						$chars{0xd065}=0x0119; # small e with cedilla
1027						$chars{0xd067}=0x0123; # small g with cedilla
1028						$chars{0xd068}=0x1e29; # small h with cedilla
1029						$chars{0xd06b}=0x0137; # small k with cedilla
1030						$chars{0xd06c}=0x013c; # small l with cedilla
1031						$chars{0xd06e}=0x0146; # small n with cedilla
1032						$chars{0xd072}=0x0157; # small r with cedilla
1033						$chars{0xd073}=0x015f; # small s with cedilla
1034						$chars{0xd074}=0x0163; # small t with cedilla
1035						# 5/1 rude
1036						# 5/2 hook to left
1037						# 5/3 ogonek (hook to right
1038						$chars{0xd320}=0x02db; # ogonek
1039						$chars{0xd341}=0x0104; # capital a with ogonek
1040						$chars{0xd345}=0x0118; # capital e with ogonek
1041						$chars{0xd349}=0x012e; # capital i with ogonek
1042						$chars{0xd34f}=0x01ea; # capital o with ogonek
1043						$chars{0xd355}=0x0172; # capital u with ogonek
1044						$chars{0xd361}=0x0105; # small a with ogonek
1045						$chars{0xd365}=0x0119; # small e with ogonek
1046						$chars{0xd369}=0x012f; # small i with ogonek
1047						$chars{0xd36f}=0x01eb; # small o with ogonek
1048						$chars{0xd375}=0x0173; # small u with ogonek
1049						# 5/4 circle below
1050						$chars{0xd441}=0x1e00; # capital a with ring below
1051						$chars{0xd461}=0x1e01; # small a with ring below
1052						# 5/5 half circle below
1053						$chars{0xf948}=0x1e2a; # capital h with breve below
1054						$chars{0xf968}=0x1e2b; # small h with breve below
1055						# 5/6 dot below
1056						$chars{0xd641}=0x1ea0; # capital a with dot below
1057						$chars{0xd642}=0x1e04; # capital b with dot below
1058						$chars{0xd644}=0x1e0c; # capital d with dot below
1059						$chars{0xd645}=0x1eb8; # capital e with dot below
1060						$chars{0xd648}=0x1e24; # capital h with dot below
1061						$chars{0xd649}=0x1eca; # capital i with dot below
1062						$chars{0xd64b}=0x1e32; # capital k with dot below
1063						$chars{0xd64c}=0x1e36; # capital l with dot below
1064						$chars{0xd64d}=0x1e42; # capital m with dot below
1065						$chars{0xd64e}=0x1e46; # capital n with dot below
1066						$chars{0xd64f}=0x1ecc; # capital o with dot below
1067						$chars{0xd652}=0x1e5a; # capital r with dot below
1068						$chars{0xd653}=0x1e62; # capital s with dot below
1069						$chars{0xd654}=0x1e6c; # capital t with dot below
1070						$chars{0xd655}=0x1ee4; # capital u with dot below
1071						$chars{0xd656}=0x1e7e; # capital v with dot below
1072						$chars{0xd657}=0x1e88; # capital w with dot below
1073						$chars{0xd659}=0x1ef4; # capital y with dot below
1074						$chars{0xd65a}=0x1e92; # capital z with dot below
1075						$chars{0xd661}=0x1ea1; # small a with dot below
1076						$chars{0xd662}=0x1e05; # small b with dot below
1077						$chars{0xd664}=0x1e0d; # small d with dot below
1078						$chars{0xd665}=0x1eb9; # small e with dot below
1079						$chars{0xd668}=0x1e25; # small h with dot below
1080						$chars{0xd669}=0x1ecb; # small i with dot below
1081						$chars{0xd66b}=0x1e33; # small k with dot below
1082						$chars{0xd66c}=0x1e37; # small l with dot below
1083						$chars{0xd66d}=0x1e43; # small m with dot below
1084						$chars{0xd66e}=0x1e47; # small n with dot below
1085						$chars{0xd66f}=0x1ecd; # small o with dot below
1086						$chars{0xd672}=0x1e5b; # small r with dot below
1087						$chars{0xd673}=0x1e63; # small s with dot below
1088						$chars{0xd674}=0x1e6d; # small t with dot below
1089						$chars{0xd675}=0x1ee5; # small u with dot below
1090						$chars{0xd676}=0x1e7f; # small v with dot below
1091						$chars{0xd677}=0x1e89; # small w with dot below
1092						$chars{0xd679}=0x1ef5; # small y with dot below
1093						$chars{0xd67a}=0x1e93; # small z with dot below
1094						# 5/7 double dot below
1095						$chars{0xd755}=0x1e72; # capital u with diaeresis below
1096						$chars{0xd775}=0x1e73; # small u with diaeresis below
1097						# 5/8 underline
1098						$chars{0xd820}=0x005f; # underline
1099						# 5/9 double underline
1100						$chars{0xd920}=0x2017; # double underline
1101						# 5/10 small low vertical bar
1102						$chars{0xda20}=0x02cc; #
1103						# 5/11 circumflex below
1104						# 5/12 (this position shall not be used)
1105						# 5/13 left half of ligature sign and of double tilde
1106						# 5/14 right half of ligature sign
1107						# 5/15 right half of double tilde
1108						# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1109
1110						sub char_decode5426 {
1111	0			0		my ( $string) = @_;
1112	0					my $result;
1113
1114	0					my @data = unpack("C*", $string);
1115	0					my @characters;
1116	0					my $length=scalar(@data);
1117						for (my $i = 0; $i < scalar(@data); $i++) {
1118	0					my $char= $data[$i];
1119	0	0 0	0 0			if ($char >= 0x00 && $char <= 0x7F){
1120						#IsAscii
1121
1122	0	0	0			push @characters,$char unless ($char<0x02 \|\|$char== 0x0F);
1123						}elsif (($char >= 0xC0 && $char <= 0xDF)) {
1124						#Combined Char
1125	0					my $convchar ;
1126	0	0 0				if ($chars{$char*256+$data[$i+1]}) {
1127	0					$convchar= $chars{$char * 256 + $data[$i+1]};
1128	0					$i++;
1129						# printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1130						} elsif ($chars{$char}) {
1131	0					$convchar= $chars{$char};
1132						# printf "0xC char %x, converted %x\n",$char,$chars{$char};
1133						}else {
1134	0					$convchar=$char;
1135						}
1136	0					push @characters,$convchar;
1137						} else {
1138	0					my $convchar;
1139	0	0				if ($chars{$char}) {
1140	0					$convchar= $chars{$char};
1141						# printf "char %x, converted %x\n",$char,$chars{$char};
1142						}else {
1143						# printf "char %x $char\n",$char;
1144	0					$convchar=$char;
1145						}
1146	0					push @characters,$convchar;
1147						}
1148	0					}
1149	0					$result=pack "U*",@characters;
1150						# $result=~s/\x01//;
1151						# $result=~s/\x00//;
1152	0					$result=~s/\x0f//;
1153	0					$result=~s/\x1b.//;
1154	0					$result=~s/\x0e//;
1155	0					$result=~s/\x1b\x5b//;
1156						# map{printf "%x",$_} @characters;
1157						# printf "\n";
1158	0					return $result;
1159						}
1160
1161						1;
1162
1163
1164 - 1170						=head1 AUTHOR Koha Development Team <http://koha-community.org/> Galen Charlton <galen.charlton@liblime.com> =cut