From 3b72c6063c5536ab64f6d9f273c1421d524e5c8b Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Sat, 10 Dec 2011 17:24:54 -0600 Subject: [PATCH 1/9] Text char codes extraction --- src/core.js | 50 +++++++++++++++++++++++++++-- src/evaluator.js | 82 ++++++++++++++++++++++++++++++++++++++++++------ src/worker.js | 22 +++++++++++++ web/viewer.js | 11 +++++++ 4 files changed, 152 insertions(+), 13 deletions(-) diff --git a/src/core.js b/src/core.js index 9f3e6b837..6a90357e8 100644 --- a/src/core.js +++ b/src/core.js @@ -200,10 +200,12 @@ var Page = (function PageClosure() { if (isArray(content)) { // fetching items var i, n = content.length; + var streams = []; for (i = 0; i < n; ++i) - content[i] = xref.fetchIfRef(content[i]); - content = new StreamsSequenceStream(content); - } + streams.push(xref.fetchIfRef(content[i])); + content = new StreamsSequenceStream(streams); + } else if (isStream(content)) + content.pos = 0; var pe = this.pe = new PartialEvaluator( xref, handler, 'p' + this.pageNumber + '_'); @@ -212,6 +214,36 @@ var Page = (function PageClosure() { dependency)); }, + extractTextContent: function pageExtractPageContent() { + if ('textContent' in this) { + // text content was extracted + return this.textContent; + } + + var handler = { + on: function () {}, + send: function() {} + }; + + var xref = this.xref; + var content = xref.fetchIfRef(this.content); + var resources = xref.fetchIfRef(this.resources); + if (isArray(content)) { + // fetching items + var i, n = content.length; + var streams = []; + for (i = 0; i < n; ++i) + streams.push(xref.fetchIfRef(content[i])); + content = new StreamsSequenceStream(streams); + } else if (isStream(content)) + content.pos = 0; + + var pe = new PartialEvaluator( + xref, handler, 'p' + this.pageNumber + '_'); + var text = pe.getTextContent(content, resources); + return (this.textContent = text); + }, + ensureFonts: function pageEnsureFonts(fonts, callback) { // Convert the font names to the corresponding font obj. 
for (var i = 0, ii = fonts.length; i < ii; i++) { @@ -614,6 +646,12 @@ var PDFDoc = (function PDFDocClosure() { throw data.error; }, this); + messageHandler.on('text_extracted', function pdfDocError(data) { + var index = data.index; + if (this.textExtracted) + this.textExtracted(index); + }, this); + setTimeout(function pdfDocFontReadySetTimeout() { messageHandler.send('doc', this.data); this.workerReadyPromise.resolve(true); @@ -643,6 +681,12 @@ var PDFDoc = (function PDFDocClosure() { return (this.pageCache[n] = page); }, + extractText: function pdfDocExtractExtractText() { + this.workerReadyPromise.then(function pdfDocStartRenderingThen() { + this.messageHandler.send('extract_text'); + }.bind(this)); + }, + destroy: function pdfDocDestroy() { if (this.worker) this.worker.terminate(); diff --git a/src/evaluator.js b/src/evaluator.js index a5ca627c5..588da5084 100644 --- a/src/evaluator.js +++ b/src/evaluator.js @@ -144,7 +144,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { fontRef = fontRef || fontRes.get(fontName); var font = xref.fetchIfRef(fontRef); assertWellFormed(isDict(font)); - if (!font.translated) { + if (!font.loadedName) { font.translated = self.translateFont(font, xref, resources, dependency); if (font.translated) { @@ -464,6 +464,65 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }; }, + getTextContent: function partialEvaluatorGetIRQueue(stream, resources) { + + var self = this; + var xref = this.xref; + + function handleSetFont(fontName, fontRef) { + var fontRes = resources.get('Font'); + + // TODO: TOASK: Is it possible to get here? If so, what does + // args[0].name should be like??? 
+ assert(fontRes, 'fontRes not available'); + + fontRes = xref.fetchIfRef(fontRes); + fontRef = fontRef || fontRes.get(fontName); + var font = xref.fetchIfRef(fontRef), tra; + assertWellFormed(isDict(font)); + if (!font.translated) { + font.translated = self.translateFont(font, xref, resources); + } + return font; + } + + resources = xref.fetchIfRef(resources) || new Dict(); + + var parser = new Parser(new Lexer(stream), false); + var res = resources; + var args = [], obj; + + var text = ''; + var font = null; + while (!isEOF(obj = parser.getObj())) { + if (isCmd(obj)) { + var cmd = obj.cmd; + switch (cmd) { + case 'Tf': + font = handleSetFont(args[0].name); + break; + case 'TJ': + var items = args[0]; + for (var j = 0, jj = items.length; j < jj; j++) { + if (typeof items[j] === 'string') + text += items[j]; + } + break; + case 'Tj': + text += args[0]; + break; + } // switch + + args = []; + } else if (obj != null) { + assertWellFormed(args.length <= 33, 'Too many arguments'); + args.push(obj); + } + } + + return text; + }, + extractDataStructures: function partialEvaluatorExtractDataStructures(dict, baseDict, xref, properties) { @@ -837,15 +896,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { if (type.name === 'Type3') { properties.coded = true; - var charProcs = xref.fetchIfRef(dict.get('CharProcs')); - var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; - properties.resources = fontResources; - properties.charProcIRQueues = {}; - for (var key in charProcs.map) { - var glyphStream = xref.fetchIfRef(charProcs.map[key]); - var queueObj = {}; - properties.charProcIRQueues[key] = - this.getIRQueue(glyphStream, fontResources, queueObj, dependency); + // read char procs only if dependency is specified + if (dependency) { + var charProcs = xref.fetchIfRef(dict.get('CharProcs')); + var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; + properties.resources = fontResources; + properties.charProcIRQueues = {}; + 
for (var key in charProcs.map) { + var glyphStream = xref.fetchIfRef(charProcs.map[key]); + var queueObj = {}; + properties.charProcIRQueues[key] = + this.getIRQueue(glyphStream, fontResources, queueObj, dependency); + } } } diff --git a/src/worker.js b/src/worker.js index 52e631c92..c3992e54f 100644 --- a/src/worker.js +++ b/src/worker.js @@ -160,6 +160,28 @@ var WorkerMessageHandler = { handler.send('font_ready', [objId, obj]); }); + + handler.on('extract_text', function wphExtractText() { + var numPages = pdfDoc.numPages; + var index = []; + for (var i = 0; i < numPages; i++) { + var start = Date.now(); + + var textContent = ''; + try { + var page = pdfDoc.getPage(i + 1); + textContent = page.extractTextContent(); + } catch (e) { + // Skip errored pages + } + + index.push(textContent); + } + + console.log('text indexing=: time=%dms', Date.now() - start); + + handler.send('text_extracted', { index: index }); + }); } }; diff --git a/web/viewer.js b/web/viewer.js index bdcac09d5..465df5ab5 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -309,6 +309,17 @@ var PDFView = { } else this.page = 1; + + setTimeout((function loadStartTextExtraction() { + this.startTextExtraction(pdf); + }).bind(this), 500); + }, + + startTextExtraction: function(pdf) { + pdf.textExtracted = function pdfTextExtracted(index) { + console.log(index.join()); + }; + pdf.extractText(); }, setHash: function pdfViewSetHash(hash) { From 94cc2cdb75b310daff18fd84745bb13e328f51f5 Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Sun, 11 Dec 2011 17:59:19 -0600 Subject: [PATCH 2/9] Char code to unicode conversion --- src/evaluator.js | 4 +-- src/fonts.js | 74 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/src/evaluator.js b/src/evaluator.js index 588da5084..553a04364 100644 --- a/src/evaluator.js +++ b/src/evaluator.js @@ -505,11 +505,11 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { var items = args[0]; for (var j = 0, 
jj = items.length; j < jj; j++) { if (typeof items[j] === 'string') - text += items[j]; + text += fontCharsToUnicode(items[j], font.translated.properties); } break; case 'Tj': - text += args[0]; + text += fontCharsToUnicode(args[0], font.translated.properties);; break; } // switch diff --git a/src/fonts.js b/src/fonts.js index 8609ae608..3c65a1a07 100644 --- a/src/fonts.js +++ b/src/fonts.js @@ -723,6 +723,48 @@ function isSpecialUnicode(unicode) { unicode < kCmapGlyphOffset + kSizeOfGlyphArea); } +function fontCharsToUnicode(charCodes, fontProperties) { + var toUnicode = fontProperties.toUnicode; + var composite = fontProperties.composite; + var encoding, differences, cidToUnicode; + var result = ''; + if (composite) { + cidToUnicode = fontProperties.cidToUnicode + for (var i = 0, ii = charCodes.length; i < ii; i += 2) { + var charCode = (charCodes.charCodeAt(i) << 8) | charCodes.charCodeAt(i + 1); + if (toUnicode && charCode in toUnicode) { + var unicode = toUnicode[charCode]; + result += typeof unicode !== 'number' ? unicode : + String.fromCharCode(unicode); + continue; + } + result += String.fromCharCode(!cidToUnicode ? charCode : + cidToUnicode[charCode] || charCode) + } + } else { + differences = fontProperties.differences; + encoding = fontProperties.baseEncoding; + for (var i = 0, ii = charCodes.length; i < ii; i++) { + var charCode = charCodes.charCodeAt(i); + if (toUnicode && charCode in toUnicode) { + var unicode = toUnicode[charCode]; + result += typeof unicode !== 'number' ? unicode : + String.fromCharCode(unicode); + continue; + } + + var glyphName = charCode in differences ? 
differences[charCode] : + encoding[charCode]; + if (glyphName in GlyphsUnicode) { + result += String.fromCharCode(GlyphsUnicode[glyphName]); + continue; + } + result += String.fromCharCode(charCode) + } + } + return result; +} + /** * 'Font' is the class the outside world should use, it encapsulate all the font * decoding logics whatever type it is (assuming the font type is supported). @@ -2098,7 +2140,7 @@ var Font = (function FontClosure() { }, charToGlyph: function fonts_charToGlyph(charcode) { - var unicode, width, codeIRQueue; + var fontChar, width, codeIRQueue; var width = this.widths[charcode]; @@ -2106,38 +2148,38 @@ var Font = (function FontClosure() { case 'CIDFontType0': if (this.noUnicodeAdaptation) { width = this.widths[this.unicodeToCID[charcode] || charcode]; - unicode = charcode; + fontChar = charcode; break; } - unicode = this.toUnicode[charcode] || charcode; + fontChar = this.toUnicode[charcode] || charcode; break; case 'CIDFontType2': if (this.noUnicodeAdaptation) { width = this.widths[this.unicodeToCID[charcode] || charcode]; - unicode = charcode; + fontChar = charcode; break; } - unicode = this.toUnicode[charcode] || charcode; + fontChar = this.toUnicode[charcode] || charcode; break; case 'Type1': var glyphName = this.differences[charcode] || this.encoding[charcode]; if (!isNum(width)) width = this.widths[glyphName]; if (this.noUnicodeAdaptation) { - unicode = GlyphsUnicode[glyphName] || charcode; + fontChar = GlyphsUnicode[glyphName] || charcode; break; } - unicode = this.glyphNameMap[glyphName] || + fontChar = this.glyphNameMap[glyphName] || GlyphsUnicode[glyphName] || charcode; break; case 'Type3': var glyphName = this.differences[charcode] || this.encoding[charcode]; codeIRQueue = this.charProcIRQueues[glyphName]; - unicode = charcode; + fontChar = charcode; break; case 'TrueType': if (this.useToUnicode) { - unicode = this.toUnicode[charcode] || charcode; + fontChar = this.toUnicode[charcode] || charcode; break; } var glyphName = 
this.differences[charcode] || this.encoding[charcode]; @@ -2146,19 +2188,19 @@ var Font = (function FontClosure() { if (!isNum(width)) width = this.widths[glyphName]; if (this.noUnicodeAdaptation) { - unicode = GlyphsUnicode[glyphName] || charcode; + fontChar = GlyphsUnicode[glyphName] || charcode; break; } if (!this.hasEncoding) { - unicode = this.useToUnicode ? this.toUnicode[charcode] : charcode; + fontChar = this.useToUnicode ? this.toUnicode[charcode] : charcode; break; } if (this.hasShortCmap && false) { var j = Encodings.MacRomanEncoding.indexOf(glyphName); - unicode = j >= 0 ? j : + fontChar = j >= 0 ? j : this.glyphNameMap[glyphName]; } else { - unicode = glyphName in GlyphsUnicode ? + fontChar = glyphName in GlyphsUnicode ? GlyphsUnicode[glyphName] : this.glyphNameMap[glyphName]; } @@ -2168,15 +2210,15 @@ var Font = (function FontClosure() { break; } - var unicodeChars = !('toUnicode' in this) ? charcode : - this.toUnicode[charcode] || charcode; + var unicodeChars = !('toUnicode' in this) ? fontChar : + this.toUnicode[charcode] || fontChar; if (typeof unicodeChars === 'number') unicodeChars = String.fromCharCode(unicodeChars); width = (isNum(width) ? 
width : this.defaultWidth) * this.widthMultiplier; return { - fontChar: String.fromCharCode(unicode), + fontChar: String.fromCharCode(fontChar), unicode: unicodeChars, width: width, codeIRQueue: codeIRQueue From 4d44eb61841771a5d5d0032a45a4dd70e862e197 Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Sun, 11 Dec 2011 18:14:52 -0600 Subject: [PATCH 3/9] fix stream reset, interrupting thread when pages are indexed --- src/core.js | 4 ++-- src/worker.js | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/core.js b/src/core.js index 6a90357e8..6a932f127 100644 --- a/src/core.js +++ b/src/core.js @@ -205,7 +205,7 @@ var Page = (function PageClosure() { streams.push(xref.fetchIfRef(content[i])); content = new StreamsSequenceStream(streams); } else if (isStream(content)) - content.pos = 0; + content.reset(); var pe = this.pe = new PartialEvaluator( xref, handler, 'p' + this.pageNumber + '_'); @@ -236,7 +236,7 @@ var Page = (function PageClosure() { streams.push(xref.fetchIfRef(content[i])); content = new StreamsSequenceStream(streams); } else if (isStream(content)) - content.pos = 0; + content.reset(); var pe = new PartialEvaluator( xref, handler, 'p' + this.pageNumber + '_'); diff --git a/src/worker.js b/src/worker.js index c3992e54f..3cc91d07e 100644 --- a/src/worker.js +++ b/src/worker.js @@ -164,23 +164,34 @@ var WorkerMessageHandler = { handler.on('extract_text', function wphExtractText() { var numPages = pdfDoc.numPages; var index = []; - for (var i = 0; i < numPages; i++) { - var start = Date.now(); + var start = Date.now(); + + function indexPage(pageNum) { + if (pageNum > numPages) { + console.log('text indexing=: time=%dms', Date.now() - start); + + handler.send('text_extracted', { index: index }); + return; + } var textContent = ''; try { - var page = pdfDoc.getPage(i + 1); + var page = pdfDoc.getPage(pageNum); textContent = page.extractTextContent(); } catch (e) { // Skip errored pages } index.push(textContent); + + 
// processing one page, interrupting thread to process + // other requests + setTimeout(function extractTextNextPage() { + indexPage(pageNum + 1); + }, 0); } - console.log('text indexing=: time=%dms', Date.now() - start); - - handler.send('text_extracted', { index: index }); + indexPage(1); }); } }; From fb2d165a488e34a646412615a7cc2aa2e8e82af1 Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Sun, 11 Dec 2011 19:38:20 -0600 Subject: [PATCH 4/9] Simple search results --- web/viewer.css | 40 +++++++++++++++++++- web/viewer.html | 10 +++++ web/viewer.js | 98 ++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 130 insertions(+), 18 deletions(-) diff --git a/web/viewer.css b/web/viewer.css index a1ef92810..56d513f70 100644 --- a/web/viewer.css +++ b/web/viewer.css @@ -172,9 +172,47 @@ span#info { box-shadow: 0px 2px 10px #ff0; } +#searchScrollView { + position: absolute; + top: 10px; + bottom: 10px; + left: 10px; + width: 280px; +} + +#searchToolbar { + padding-left: 0px; + right: 0px; + padding-top: 0px; + padding-bottom: 5px; +} + +#searchToolbar > input { + width: 220px; +} + +#searchResults { + overflow: auto; + background-color: #fff; + position: absolute; + top: 30px; + bottom: 0px; + left: 0px; + right: 0; + font-size: smaller; + opacity: 0.7; +} + +#searchResults a { + display: block; + white-space: pre; + text-decoration: none; + color: black; +} + #sidebarControls { position:absolute; - width: 120px; + width: 180px; height: 32px; left: 15px; bottom: 35px; diff --git a/web/viewer.html b/web/viewer.html index 53ca2a247..98c19a53c 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -124,6 +124,13 @@ +
+
diff --git a/web/viewer.js b/web/viewer.js index 465df5ab5..340586a94 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -310,18 +310,69 @@ var PDFView = { else this.page = 1; + // loosing pdf reference here, starting text indexing in 500ms setTimeout((function loadStartTextExtraction() { this.startTextExtraction(pdf); }).bind(this), 500); + delete PDFView.extractedText; }, - startTextExtraction: function(pdf) { + startTextExtraction: function pdfViewStartTextExtraction(pdf) { + var searchResults = document.getElementById('searchResults'); + searchResults.textContent = ''; + pdf.textExtracted = function pdfTextExtracted(index) { - console.log(index.join()); + PDFView.extractedText = index; }; pdf.extractText(); }, + search: function pdfViewStartSearch() { + function bindLink(link, pageNumber) { + link.href = '#' + pageNumber; + link.onclick = function searchBindLink() { + PDFView.page = pageNumber; + return false; + }; + } + + var searchResults = document.getElementById('searchResults'); + if (!('extractedText' in PDFView)) { + // not indexed yet, repeat in 1 second + searchResults.textContent = 'Searching...'; + setTimeout(this.search.bind(this), 1000); + return; + } + + var searchTermsInput = document.getElementById('searchTermsInput'); + searchResults.removeAttribute('hidden'); + searchResults.textContent = ''; + + var terms = searchTermsInput.value; + // simple search: removing spaces and hyphens, then scanning every + terms = terms.replace(/\s-/g, '').toLowerCase(); + var index = PDFView.extractedText; + var pageFound = false; + for (var i = 0, ii = index.length; i < ii; i++) { + var pageText = index[i].replace(/\s-/g, '').toLowerCase(); + var j = pageText.indexOf(terms); + if (j < 0) + continue; + + var pageNumber = i + 1; + var textSample = index[i].substr(j, 50); + var link = document.createElement('a'); + bindLink(link, pageNumber); + link.textContent = 'Page ' + pageNumber + ': ' + textSample; + searchResults.appendChild(link); + + pageFound = true; + } + 
if (!pageFound) { + searchResults.textContent = '(Not found)'; + } + }, + setHash: function pdfViewSetHash(hash) { if (!hash) return; @@ -361,23 +412,36 @@ var PDFView = { switchSidebarView: function pdfViewSwitchSidebarView(view) { var thumbsScrollView = document.getElementById('sidebarScrollView'); - var outlineScrollView = document.getElementById('outlineScrollView'); var thumbsSwitchButton = document.getElementById('thumbsSwitch'); + if (view == 'thumbs') { + thumbsScrollView.removeAttribute('hidden'); + thumbsSwitchButton.setAttribute('data-selected', true); + } else { + thumbsScrollView.setAttribute('hidden', 'true'); + thumbsSwitchButton.removeAttribute('data-selected'); + } + + var outlineScrollView = document.getElementById('outlineScrollView'); var outlineSwitchButton = document.getElementById('outlineSwitch'); - switch (view) { - case 'thumbs': - thumbsScrollView.removeAttribute('hidden'); - outlineScrollView.setAttribute('hidden', 'true'); - thumbsSwitchButton.setAttribute('data-selected', true); - outlineSwitchButton.removeAttribute('data-selected'); - updateThumbViewArea(); - break; - case 'outline': - thumbsScrollView.setAttribute('hidden', 'true'); - outlineScrollView.removeAttribute('hidden'); - thumbsSwitchButton.removeAttribute('data-selected'); - outlineSwitchButton.setAttribute('data-selected', true); - break; + if (view == 'outline') { + outlineScrollView.removeAttribute('hidden'); + outlineSwitchButton.setAttribute('data-selected', true); + } else { + outlineScrollView.setAttribute('hidden', 'true'); + outlineSwitchButton.removeAttribute('data-selected'); + } + + var searchScrollView = document.getElementById('searchScrollView'); + var searchSwitchButton = document.getElementById('searchSwitch'); + if (view == 'search') { + searchScrollView.removeAttribute('hidden'); + searchSwitchButton.setAttribute('data-selected', true); + + var searchTermsInput = document.getElementById('searchTermsInput'); + searchTermsInput.focus(); + } else { + 
searchScrollView.setAttribute('hidden', 'true'); + searchSwitchButton.removeAttribute('data-selected'); } }, From c85ec052b1add9b1224954c65c307afa7b90812e Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Wed, 14 Dec 2011 21:42:06 -0600 Subject: [PATCH 5/9] Unicode normalization; lint warnings --- src/core.js | 4 +- src/evaluator.js | 15 +- src/fonts.js | 699 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 707 insertions(+), 11 deletions(-) diff --git a/src/core.js b/src/core.js index b498401d1..633a36c1d 100644 --- a/src/core.js +++ b/src/core.js @@ -221,8 +221,8 @@ var Page = (function PageClosure() { } var handler = { - on: function () {}, - send: function() {} + on: function nullHandlerOn() {}, + send: function nullHandlerSend() {} }; var xref = this.xref; diff --git a/src/evaluator.js b/src/evaluator.js index 7e1dd3083..04556c717 100644 --- a/src/evaluator.js +++ b/src/evaluator.js @@ -494,12 +494,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { case 'TJ': var items = args[0]; for (var j = 0, jj = items.length; j < jj; j++) { - if (typeof items[j] === 'string') - text += fontCharsToUnicode(items[j], font.translated.properties); + if (typeof items[j] === 'string') { + text += fontCharsToUnicode(items[j], + font.translated.properties); + } else if (items[j] < 0) { + // making all negative offsets a space - better to have + // a space in incorrect place than not have them at all + text += ' '; + } } break; case 'Tj': - text += fontCharsToUnicode(args[0], font.translated.properties);; + text += fontCharsToUnicode(args[0], font.translated.properties); break; } // switch @@ -889,7 +895,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // read char procs only if dependency is specified if (dependency) { var charProcs = xref.fetchIfRef(dict.get('CharProcs')); - var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; + var fontResources = xref.fetchIfRef(dict.get('Resources')) || + resources; 
properties.resources = fontResources; properties.charProcIRQueues = {}; for (var key in charProcs.map) { diff --git a/src/fonts.js b/src/fonts.js index 6bbbaf014..68fe9fb59 100644 --- a/src/fonts.js +++ b/src/fonts.js @@ -723,15 +723,694 @@ function isSpecialUnicode(unicode) { unicode < kCmapGlyphOffset + kSizeOfGlyphArea); } +// The normalization table is obtained by filtering the Unicode characters +// database with entries. +var NormalizedUnicodes = { + '\u00A8': '\u0020\u0308', + '\u00AF': '\u0020\u0304', + '\u00B4': '\u0020\u0301', + '\u00B5': '\u03BC', + '\u00B8': '\u0020\u0327', + '\u0132': '\u0049\u004A', + '\u0133': '\u0069\u006A', + '\u013F': '\u004C\u00B7', + '\u0140': '\u006C\u00B7', + '\u0149': '\u02BC\u006E', + '\u017F': '\u0073', + '\u01C4': '\u0044\u017D', + '\u01C5': '\u0044\u017E', + '\u01C6': '\u0064\u017E', + '\u01C7': '\u004C\u004A', + '\u01C8': '\u004C\u006A', + '\u01C9': '\u006C\u006A', + '\u01CA': '\u004E\u004A', + '\u01CB': '\u004E\u006A', + '\u01CC': '\u006E\u006A', + '\u01F1': '\u0044\u005A', + '\u01F2': '\u0044\u007A', + '\u01F3': '\u0064\u007A', + '\u02D8': '\u0020\u0306', + '\u02D9': '\u0020\u0307', + '\u02DA': '\u0020\u030A', + '\u02DB': '\u0020\u0328', + '\u02DC': '\u0020\u0303', + '\u02DD': '\u0020\u030B', + '\u037A': '\u0020\u0345', + '\u0384': '\u0020\u0301', + '\u03D0': '\u03B2', + '\u03D1': '\u03B8', + '\u03D2': '\u03A5', + '\u03D5': '\u03C6', + '\u03D6': '\u03C0', + '\u03F0': '\u03BA', + '\u03F1': '\u03C1', + '\u03F2': '\u03C2', + '\u03F4': '\u0398', + '\u03F5': '\u03B5', + '\u03F9': '\u03A3', + '\u0587': '\u0565\u0582', + '\u0675': '\u0627\u0674', + '\u0676': '\u0648\u0674', + '\u0677': '\u06C7\u0674', + '\u0678': '\u064A\u0674', + '\u0E33': '\u0E4D\u0E32', + '\u0EB3': '\u0ECD\u0EB2', + '\u0EDC': '\u0EAB\u0E99', + '\u0EDD': '\u0EAB\u0EA1', + '\u0F77': '\u0FB2\u0F81', + '\u0F79': '\u0FB3\u0F81', + '\u1E9A': '\u0061\u02BE', + '\u1FBD': '\u0020\u0313', + '\u1FBF': '\u0020\u0313', + '\u1FC0': '\u0020\u0342', + '\u1FFE': 
'\u0020\u0314', + '\u2002': '\u0020', + '\u2003': '\u0020', + '\u2004': '\u0020', + '\u2005': '\u0020', + '\u2006': '\u0020', + '\u2008': '\u0020', + '\u2009': '\u0020', + '\u200A': '\u0020', + '\u2017': '\u0020\u0333', + '\u2024': '\u002E', + '\u2025': '\u002E\u002E', + '\u2026': '\u002E\u002E\u002E', + '\u2033': '\u2032\u2032', + '\u2034': '\u2032\u2032\u2032', + '\u2036': '\u2035\u2035', + '\u2037': '\u2035\u2035\u2035', + '\u203C': '\u0021\u0021', + '\u203E': '\u0020\u0305', + '\u2047': '\u003F\u003F', + '\u2048': '\u003F\u0021', + '\u2049': '\u0021\u003F', + '\u2057': '\u2032\u2032\u2032\u2032', + '\u205F': '\u0020', + '\u20A8': '\u0052\u0073', + '\u2100': '\u0061\u002F\u0063', + '\u2101': '\u0061\u002F\u0073', + '\u2103': '\u00B0\u0043', + '\u2105': '\u0063\u002F\u006F', + '\u2106': '\u0063\u002F\u0075', + '\u2107': '\u0190', + '\u2109': '\u00B0\u0046', + '\u2116': '\u004E\u006F', + '\u2121': '\u0054\u0045\u004C', + '\u2135': '\u05D0', + '\u2136': '\u05D1', + '\u2137': '\u05D2', + '\u2138': '\u05D3', + '\u213B': '\u0046\u0041\u0058', + '\u2160': '\u0049', + '\u2161': '\u0049\u0049', + '\u2162': '\u0049\u0049\u0049', + '\u2163': '\u0049\u0056', + '\u2164': '\u0056', + '\u2165': '\u0056\u0049', + '\u2166': '\u0056\u0049\u0049', + '\u2167': '\u0056\u0049\u0049\u0049', + '\u2168': '\u0049\u0058', + '\u2169': '\u0058', + '\u216A': '\u0058\u0049', + '\u216B': '\u0058\u0049\u0049', + '\u216C': '\u004C', + '\u216D': '\u0043', + '\u216E': '\u0044', + '\u216F': '\u004D', + '\u2170': '\u0069', + '\u2171': '\u0069\u0069', + '\u2172': '\u0069\u0069\u0069', + '\u2173': '\u0069\u0076', + '\u2174': '\u0076', + '\u2175': '\u0076\u0069', + '\u2176': '\u0076\u0069\u0069', + '\u2177': '\u0076\u0069\u0069\u0069', + '\u2178': '\u0069\u0078', + '\u2179': '\u0078', + '\u217A': '\u0078\u0069', + '\u217B': '\u0078\u0069\u0069', + '\u217C': '\u006C', + '\u217D': '\u0063', + '\u217E': '\u0064', + '\u217F': '\u006D', + '\u222C': '\u222B\u222B', + '\u222D': '\u222B\u222B\u222B', + 
'\u222F': '\u222E\u222E', + '\u2230': '\u222E\u222E\u222E', + '\u2474': '\u0028\u0031\u0029', + '\u2475': '\u0028\u0032\u0029', + '\u2476': '\u0028\u0033\u0029', + '\u2477': '\u0028\u0034\u0029', + '\u2478': '\u0028\u0035\u0029', + '\u2479': '\u0028\u0036\u0029', + '\u247A': '\u0028\u0037\u0029', + '\u247B': '\u0028\u0038\u0029', + '\u247C': '\u0028\u0039\u0029', + '\u247D': '\u0028\u0031\u0030\u0029', + '\u247E': '\u0028\u0031\u0031\u0029', + '\u247F': '\u0028\u0031\u0032\u0029', + '\u2480': '\u0028\u0031\u0033\u0029', + '\u2481': '\u0028\u0031\u0034\u0029', + '\u2482': '\u0028\u0031\u0035\u0029', + '\u2483': '\u0028\u0031\u0036\u0029', + '\u2484': '\u0028\u0031\u0037\u0029', + '\u2485': '\u0028\u0031\u0038\u0029', + '\u2486': '\u0028\u0031\u0039\u0029', + '\u2487': '\u0028\u0032\u0030\u0029', + '\u2488': '\u0031\u002E', + '\u2489': '\u0032\u002E', + '\u248A': '\u0033\u002E', + '\u248B': '\u0034\u002E', + '\u248C': '\u0035\u002E', + '\u248D': '\u0036\u002E', + '\u248E': '\u0037\u002E', + '\u248F': '\u0038\u002E', + '\u2490': '\u0039\u002E', + '\u2491': '\u0031\u0030\u002E', + '\u2492': '\u0031\u0031\u002E', + '\u2493': '\u0031\u0032\u002E', + '\u2494': '\u0031\u0033\u002E', + '\u2495': '\u0031\u0034\u002E', + '\u2496': '\u0031\u0035\u002E', + '\u2497': '\u0031\u0036\u002E', + '\u2498': '\u0031\u0037\u002E', + '\u2499': '\u0031\u0038\u002E', + '\u249A': '\u0031\u0039\u002E', + '\u249B': '\u0032\u0030\u002E', + '\u249C': '\u0028\u0061\u0029', + '\u249D': '\u0028\u0062\u0029', + '\u249E': '\u0028\u0063\u0029', + '\u249F': '\u0028\u0064\u0029', + '\u24A0': '\u0028\u0065\u0029', + '\u24A1': '\u0028\u0066\u0029', + '\u24A2': '\u0028\u0067\u0029', + '\u24A3': '\u0028\u0068\u0029', + '\u24A4': '\u0028\u0069\u0029', + '\u24A5': '\u0028\u006A\u0029', + '\u24A6': '\u0028\u006B\u0029', + '\u24A7': '\u0028\u006C\u0029', + '\u24A8': '\u0028\u006D\u0029', + '\u24A9': '\u0028\u006E\u0029', + '\u24AA': '\u0028\u006F\u0029', + '\u24AB': '\u0028\u0070\u0029', + '\u24AC': 
'\u0028\u0071\u0029', + '\u24AD': '\u0028\u0072\u0029', + '\u24AE': '\u0028\u0073\u0029', + '\u24AF': '\u0028\u0074\u0029', + '\u24B0': '\u0028\u0075\u0029', + '\u24B1': '\u0028\u0076\u0029', + '\u24B2': '\u0028\u0077\u0029', + '\u24B3': '\u0028\u0078\u0029', + '\u24B4': '\u0028\u0079\u0029', + '\u24B5': '\u0028\u007A\u0029', + '\u2A0C': '\u222B\u222B\u222B\u222B', + '\u2A74': '\u003A\u003A\u003D', + '\u2A75': '\u003D\u003D', + '\u2A76': '\u003D\u003D\u003D', + '\u2E9F': '\u6BCD', + '\u2EF3': '\u9F9F', + '\u2F00': '\u4E00', + '\u2F01': '\u4E28', + '\u2F02': '\u4E36', + '\u2F03': '\u4E3F', + '\u2F04': '\u4E59', + '\u2F05': '\u4E85', + '\u2F06': '\u4E8C', + '\u2F07': '\u4EA0', + '\u2F08': '\u4EBA', + '\u2F09': '\u513F', + '\u2F0A': '\u5165', + '\u2F0B': '\u516B', + '\u2F0C': '\u5182', + '\u2F0D': '\u5196', + '\u2F0E': '\u51AB', + '\u2F0F': '\u51E0', + '\u2F10': '\u51F5', + '\u2F11': '\u5200', + '\u2F12': '\u529B', + '\u2F13': '\u52F9', + '\u2F14': '\u5315', + '\u2F15': '\u531A', + '\u2F16': '\u5338', + '\u2F17': '\u5341', + '\u2F18': '\u535C', + '\u2F19': '\u5369', + '\u2F1A': '\u5382', + '\u2F1B': '\u53B6', + '\u2F1C': '\u53C8', + '\u2F1D': '\u53E3', + '\u2F1E': '\u56D7', + '\u2F1F': '\u571F', + '\u2F20': '\u58EB', + '\u2F21': '\u5902', + '\u2F22': '\u590A', + '\u2F23': '\u5915', + '\u2F24': '\u5927', + '\u2F25': '\u5973', + '\u2F26': '\u5B50', + '\u2F27': '\u5B80', + '\u2F28': '\u5BF8', + '\u2F29': '\u5C0F', + '\u2F2A': '\u5C22', + '\u2F2B': '\u5C38', + '\u2F2C': '\u5C6E', + '\u2F2D': '\u5C71', + '\u2F2E': '\u5DDB', + '\u2F2F': '\u5DE5', + '\u2F30': '\u5DF1', + '\u2F31': '\u5DFE', + '\u2F32': '\u5E72', + '\u2F33': '\u5E7A', + '\u2F34': '\u5E7F', + '\u2F35': '\u5EF4', + '\u2F36': '\u5EFE', + '\u2F37': '\u5F0B', + '\u2F38': '\u5F13', + '\u2F39': '\u5F50', + '\u2F3A': '\u5F61', + '\u2F3B': '\u5F73', + '\u2F3C': '\u5FC3', + '\u2F3D': '\u6208', + '\u2F3E': '\u6236', + '\u2F3F': '\u624B', + '\u2F40': '\u652F', + '\u2F41': '\u6534', + '\u2F42': '\u6587', + '\u2F43': 
'\u6597', + '\u2F44': '\u65A4', + '\u2F45': '\u65B9', + '\u2F46': '\u65E0', + '\u2F47': '\u65E5', + '\u2F48': '\u66F0', + '\u2F49': '\u6708', + '\u2F4A': '\u6728', + '\u2F4B': '\u6B20', + '\u2F4C': '\u6B62', + '\u2F4D': '\u6B79', + '\u2F4E': '\u6BB3', + '\u2F4F': '\u6BCB', + '\u2F50': '\u6BD4', + '\u2F51': '\u6BDB', + '\u2F52': '\u6C0F', + '\u2F53': '\u6C14', + '\u2F54': '\u6C34', + '\u2F55': '\u706B', + '\u2F56': '\u722A', + '\u2F57': '\u7236', + '\u2F58': '\u723B', + '\u2F59': '\u723F', + '\u2F5A': '\u7247', + '\u2F5B': '\u7259', + '\u2F5C': '\u725B', + '\u2F5D': '\u72AC', + '\u2F5E': '\u7384', + '\u2F5F': '\u7389', + '\u2F60': '\u74DC', + '\u2F61': '\u74E6', + '\u2F62': '\u7518', + '\u2F63': '\u751F', + '\u2F64': '\u7528', + '\u2F65': '\u7530', + '\u2F66': '\u758B', + '\u2F67': '\u7592', + '\u2F68': '\u7676', + '\u2F69': '\u767D', + '\u2F6A': '\u76AE', + '\u2F6B': '\u76BF', + '\u2F6C': '\u76EE', + '\u2F6D': '\u77DB', + '\u2F6E': '\u77E2', + '\u2F6F': '\u77F3', + '\u2F70': '\u793A', + '\u2F71': '\u79B8', + '\u2F72': '\u79BE', + '\u2F73': '\u7A74', + '\u2F74': '\u7ACB', + '\u2F75': '\u7AF9', + '\u2F76': '\u7C73', + '\u2F77': '\u7CF8', + '\u2F78': '\u7F36', + '\u2F79': '\u7F51', + '\u2F7A': '\u7F8A', + '\u2F7B': '\u7FBD', + '\u2F7C': '\u8001', + '\u2F7D': '\u800C', + '\u2F7E': '\u8012', + '\u2F7F': '\u8033', + '\u2F80': '\u807F', + '\u2F81': '\u8089', + '\u2F82': '\u81E3', + '\u2F83': '\u81EA', + '\u2F84': '\u81F3', + '\u2F85': '\u81FC', + '\u2F86': '\u820C', + '\u2F87': '\u821B', + '\u2F88': '\u821F', + '\u2F89': '\u826E', + '\u2F8A': '\u8272', + '\u2F8B': '\u8278', + '\u2F8C': '\u864D', + '\u2F8D': '\u866B', + '\u2F8E': '\u8840', + '\u2F8F': '\u884C', + '\u2F90': '\u8863', + '\u2F91': '\u897E', + '\u2F92': '\u898B', + '\u2F93': '\u89D2', + '\u2F94': '\u8A00', + '\u2F95': '\u8C37', + '\u2F96': '\u8C46', + '\u2F97': '\u8C55', + '\u2F98': '\u8C78', + '\u2F99': '\u8C9D', + '\u2F9A': '\u8D64', + '\u2F9B': '\u8D70', + '\u2F9C': '\u8DB3', + '\u2F9D': '\u8EAB', + 
'\u2F9E': '\u8ECA', + '\u2F9F': '\u8F9B', + '\u2FA0': '\u8FB0', + '\u2FA1': '\u8FB5', + '\u2FA2': '\u9091', + '\u2FA3': '\u9149', + '\u2FA4': '\u91C6', + '\u2FA5': '\u91CC', + '\u2FA6': '\u91D1', + '\u2FA7': '\u9577', + '\u2FA8': '\u9580', + '\u2FA9': '\u961C', + '\u2FAA': '\u96B6', + '\u2FAB': '\u96B9', + '\u2FAC': '\u96E8', + '\u2FAD': '\u9751', + '\u2FAE': '\u975E', + '\u2FAF': '\u9762', + '\u2FB0': '\u9769', + '\u2FB1': '\u97CB', + '\u2FB2': '\u97ED', + '\u2FB3': '\u97F3', + '\u2FB4': '\u9801', + '\u2FB5': '\u98A8', + '\u2FB6': '\u98DB', + '\u2FB7': '\u98DF', + '\u2FB8': '\u9996', + '\u2FB9': '\u9999', + '\u2FBA': '\u99AC', + '\u2FBB': '\u9AA8', + '\u2FBC': '\u9AD8', + '\u2FBD': '\u9ADF', + '\u2FBE': '\u9B25', + '\u2FBF': '\u9B2F', + '\u2FC0': '\u9B32', + '\u2FC1': '\u9B3C', + '\u2FC2': '\u9B5A', + '\u2FC3': '\u9CE5', + '\u2FC4': '\u9E75', + '\u2FC5': '\u9E7F', + '\u2FC6': '\u9EA5', + '\u2FC7': '\u9EBB', + '\u2FC8': '\u9EC3', + '\u2FC9': '\u9ECD', + '\u2FCA': '\u9ED1', + '\u2FCB': '\u9EF9', + '\u2FCC': '\u9EFD', + '\u2FCD': '\u9F0E', + '\u2FCE': '\u9F13', + '\u2FCF': '\u9F20', + '\u2FD0': '\u9F3B', + '\u2FD1': '\u9F4A', + '\u2FD2': '\u9F52', + '\u2FD3': '\u9F8D', + '\u2FD4': '\u9F9C', + '\u2FD5': '\u9FA0', + '\u3036': '\u3012', + '\u3038': '\u5341', + '\u3039': '\u5344', + '\u303A': '\u5345', + '\u309B': '\u0020\u3099', + '\u309C': '\u0020\u309A', + '\u3131': '\u1100', + '\u3132': '\u1101', + '\u3133': '\u11AA', + '\u3134': '\u1102', + '\u3135': '\u11AC', + '\u3136': '\u11AD', + '\u3137': '\u1103', + '\u3138': '\u1104', + '\u3139': '\u1105', + '\u313A': '\u11B0', + '\u313B': '\u11B1', + '\u313C': '\u11B2', + '\u313D': '\u11B3', + '\u313E': '\u11B4', + '\u313F': '\u11B5', + '\u3140': '\u111A', + '\u3141': '\u1106', + '\u3142': '\u1107', + '\u3143': '\u1108', + '\u3144': '\u1121', + '\u3145': '\u1109', + '\u3146': '\u110A', + '\u3147': '\u110B', + '\u3148': '\u110C', + '\u3149': '\u110D', + '\u314A': '\u110E', + '\u314B': '\u110F', + '\u314C': '\u1110', + 
'\u314D': '\u1111', + '\u314E': '\u1112', + '\u314F': '\u1161', + '\u3150': '\u1162', + '\u3151': '\u1163', + '\u3152': '\u1164', + '\u3153': '\u1165', + '\u3154': '\u1166', + '\u3155': '\u1167', + '\u3156': '\u1168', + '\u3157': '\u1169', + '\u3158': '\u116A', + '\u3159': '\u116B', + '\u315A': '\u116C', + '\u315B': '\u116D', + '\u315C': '\u116E', + '\u315D': '\u116F', + '\u315E': '\u1170', + '\u315F': '\u1171', + '\u3160': '\u1172', + '\u3161': '\u1173', + '\u3162': '\u1174', + '\u3163': '\u1175', + '\u3164': '\u1160', + '\u3165': '\u1114', + '\u3166': '\u1115', + '\u3167': '\u11C7', + '\u3168': '\u11C8', + '\u3169': '\u11CC', + '\u316A': '\u11CE', + '\u316B': '\u11D3', + '\u316C': '\u11D7', + '\u316D': '\u11D9', + '\u316E': '\u111C', + '\u316F': '\u11DD', + '\u3170': '\u11DF', + '\u3171': '\u111D', + '\u3172': '\u111E', + '\u3173': '\u1120', + '\u3174': '\u1122', + '\u3175': '\u1123', + '\u3176': '\u1127', + '\u3177': '\u1129', + '\u3178': '\u112B', + '\u3179': '\u112C', + '\u317A': '\u112D', + '\u317B': '\u112E', + '\u317C': '\u112F', + '\u317D': '\u1132', + '\u317E': '\u1136', + '\u317F': '\u1140', + '\u3180': '\u1147', + '\u3181': '\u114C', + '\u3182': '\u11F1', + '\u3183': '\u11F2', + '\u3184': '\u1157', + '\u3185': '\u1158', + '\u3186': '\u1159', + '\u3187': '\u1184', + '\u3188': '\u1185', + '\u3189': '\u1188', + '\u318A': '\u1191', + '\u318B': '\u1192', + '\u318C': '\u1194', + '\u318D': '\u119E', + '\u318E': '\u11A1', + '\u3200': '\u0028\u1100\u0029', + '\u3201': '\u0028\u1102\u0029', + '\u3202': '\u0028\u1103\u0029', + '\u3203': '\u0028\u1105\u0029', + '\u3204': '\u0028\u1106\u0029', + '\u3205': '\u0028\u1107\u0029', + '\u3206': '\u0028\u1109\u0029', + '\u3207': '\u0028\u110B\u0029', + '\u3208': '\u0028\u110C\u0029', + '\u3209': '\u0028\u110E\u0029', + '\u320A': '\u0028\u110F\u0029', + '\u320B': '\u0028\u1110\u0029', + '\u320C': '\u0028\u1111\u0029', + '\u320D': '\u0028\u1112\u0029', + '\u320E': '\u0028\u1100\u1161\u0029', + '\u320F': 
'\u0028\u1102\u1161\u0029', + '\u3210': '\u0028\u1103\u1161\u0029', + '\u3211': '\u0028\u1105\u1161\u0029', + '\u3212': '\u0028\u1106\u1161\u0029', + '\u3213': '\u0028\u1107\u1161\u0029', + '\u3214': '\u0028\u1109\u1161\u0029', + '\u3215': '\u0028\u110B\u1161\u0029', + '\u3216': '\u0028\u110C\u1161\u0029', + '\u3217': '\u0028\u110E\u1161\u0029', + '\u3218': '\u0028\u110F\u1161\u0029', + '\u3219': '\u0028\u1110\u1161\u0029', + '\u321A': '\u0028\u1111\u1161\u0029', + '\u321B': '\u0028\u1112\u1161\u0029', + '\u321C': '\u0028\u110C\u116E\u0029', + '\u321D': '\u0028\u110B\u1169\u110C\u1165\u11AB\u0029', + '\u321E': '\u0028\u110B\u1169\u1112\u116E\u0029', + '\u3220': '\u0028\u4E00\u0029', + '\u3221': '\u0028\u4E8C\u0029', + '\u3222': '\u0028\u4E09\u0029', + '\u3223': '\u0028\u56DB\u0029', + '\u3224': '\u0028\u4E94\u0029', + '\u3225': '\u0028\u516D\u0029', + '\u3226': '\u0028\u4E03\u0029', + '\u3227': '\u0028\u516B\u0029', + '\u3228': '\u0028\u4E5D\u0029', + '\u3229': '\u0028\u5341\u0029', + '\u322A': '\u0028\u6708\u0029', + '\u322B': '\u0028\u706B\u0029', + '\u322C': '\u0028\u6C34\u0029', + '\u322D': '\u0028\u6728\u0029', + '\u322E': '\u0028\u91D1\u0029', + '\u322F': '\u0028\u571F\u0029', + '\u3230': '\u0028\u65E5\u0029', + '\u3231': '\u0028\u682A\u0029', + '\u3232': '\u0028\u6709\u0029', + '\u3233': '\u0028\u793E\u0029', + '\u3234': '\u0028\u540D\u0029', + '\u3235': '\u0028\u7279\u0029', + '\u3236': '\u0028\u8CA1\u0029', + '\u3237': '\u0028\u795D\u0029', + '\u3238': '\u0028\u52B4\u0029', + '\u3239': '\u0028\u4EE3\u0029', + '\u323A': '\u0028\u547C\u0029', + '\u323B': '\u0028\u5B66\u0029', + '\u323C': '\u0028\u76E3\u0029', + '\u323D': '\u0028\u4F01\u0029', + '\u323E': '\u0028\u8CC7\u0029', + '\u323F': '\u0028\u5354\u0029', + '\u3240': '\u0028\u796D\u0029', + '\u3241': '\u0028\u4F11\u0029', + '\u3242': '\u0028\u81EA\u0029', + '\u3243': '\u0028\u81F3\u0029', + '\u32C0': '\u0031\u6708', + '\u32C1': '\u0032\u6708', + '\u32C2': '\u0033\u6708', + '\u32C3': '\u0034\u6708', + 
'\u32C4': '\u0035\u6708', + '\u32C5': '\u0036\u6708', + '\u32C6': '\u0037\u6708', + '\u32C7': '\u0038\u6708', + '\u32C8': '\u0039\u6708', + '\u32C9': '\u0031\u0030\u6708', + '\u32CA': '\u0031\u0031\u6708', + '\u32CB': '\u0031\u0032\u6708', + '\u3358': '\u0030\u70B9', + '\u3359': '\u0031\u70B9', + '\u335A': '\u0032\u70B9', + '\u335B': '\u0033\u70B9', + '\u335C': '\u0034\u70B9', + '\u335D': '\u0035\u70B9', + '\u335E': '\u0036\u70B9', + '\u335F': '\u0037\u70B9', + '\u3360': '\u0038\u70B9', + '\u3361': '\u0039\u70B9', + '\u3362': '\u0031\u0030\u70B9', + '\u3363': '\u0031\u0031\u70B9', + '\u3364': '\u0031\u0032\u70B9', + '\u3365': '\u0031\u0033\u70B9', + '\u3366': '\u0031\u0034\u70B9', + '\u3367': '\u0031\u0035\u70B9', + '\u3368': '\u0031\u0036\u70B9', + '\u3369': '\u0031\u0037\u70B9', + '\u336A': '\u0031\u0038\u70B9', + '\u336B': '\u0031\u0039\u70B9', + '\u336C': '\u0032\u0030\u70B9', + '\u336D': '\u0032\u0031\u70B9', + '\u336E': '\u0032\u0032\u70B9', + '\u336F': '\u0032\u0033\u70B9', + '\u3370': '\u0032\u0034\u70B9', + '\u33E0': '\u0031\u65E5', + '\u33E1': '\u0032\u65E5', + '\u33E2': '\u0033\u65E5', + '\u33E3': '\u0034\u65E5', + '\u33E4': '\u0035\u65E5', + '\u33E5': '\u0036\u65E5', + '\u33E6': '\u0037\u65E5', + '\u33E7': '\u0038\u65E5', + '\u33E8': '\u0039\u65E5', + '\u33E9': '\u0031\u0030\u65E5', + '\u33EA': '\u0031\u0031\u65E5', + '\u33EB': '\u0031\u0032\u65E5', + '\u33EC': '\u0031\u0033\u65E5', + '\u33ED': '\u0031\u0034\u65E5', + '\u33EE': '\u0031\u0035\u65E5', + '\u33EF': '\u0031\u0036\u65E5', + '\u33F0': '\u0031\u0037\u65E5', + '\u33F1': '\u0031\u0038\u65E5', + '\u33F2': '\u0031\u0039\u65E5', + '\u33F3': '\u0032\u0030\u65E5', + '\u33F4': '\u0032\u0031\u65E5', + '\u33F5': '\u0032\u0032\u65E5', + '\u33F6': '\u0032\u0033\u65E5', + '\u33F7': '\u0032\u0034\u65E5', + '\u33F8': '\u0032\u0035\u65E5', + '\u33F9': '\u0032\u0036\u65E5', + '\u33FA': '\u0032\u0037\u65E5', + '\u33FB': '\u0032\u0038\u65E5', + '\u33FC': '\u0032\u0039\u65E5', + '\u33FD': '\u0033\u0030\u65E5', + 
'\u33FE': '\u0033\u0031\u65E5', + '\uFB00': '\u0066\u0066', + '\uFB01': '\u0066\u0069', + '\uFB02': '\u0066\u006C', + '\uFB03': '\u0066\u0066\u0069', + '\uFB04': '\u0066\u0066\u006C', + '\uFB05': '\u017F\u0074', + '\uFB06': '\u0073\u0074', + '\uFB13': '\u0574\u0576', + '\uFB14': '\u0574\u0565', + '\uFB15': '\u0574\u056B', + '\uFB16': '\u057E\u0576', + '\uFB17': '\u0574\u056D', + '\uFB4F': '\u05D0\u05DC', + '\uFE49': '\u203E', + '\uFE4A': '\u203E', + '\uFE4B': '\u203E', + '\uFE4C': '\u203E', + '\uFE4D': '\u005F', + '\uFE4E': '\u005F', + '\uFE4F': '\u005F' +}; + function fontCharsToUnicode(charCodes, fontProperties) { var toUnicode = fontProperties.toUnicode; var composite = fontProperties.composite; var encoding, differences, cidToUnicode; var result = ''; if (composite) { - cidToUnicode = fontProperties.cidToUnicode + cidToUnicode = fontProperties.cidToUnicode; for (var i = 0, ii = charCodes.length; i < ii; i += 2) { - var charCode = (charCodes.charCodeAt(i) << 8) | charCodes.charCodeAt(i + 1); + var charCode = (charCodes.charCodeAt(i) << 8) | + charCodes.charCodeAt(i + 1); if (toUnicode && charCode in toUnicode) { var unicode = toUnicode[charCode]; result += typeof unicode !== 'number' ? unicode : @@ -739,13 +1418,14 @@ function fontCharsToUnicode(charCodes, fontProperties) { continue; } result += String.fromCharCode(!cidToUnicode ? charCode : - cidToUnicode[charCode] || charCode) + cidToUnicode[charCode] || charCode); } } else { differences = fontProperties.differences; encoding = fontProperties.baseEncoding; for (var i = 0, ii = charCodes.length; i < ii; i++) { var charCode = charCodes.charCodeAt(i); + var unicode; if (toUnicode && charCode in toUnicode) { var unicode = toUnicode[charCode]; result += typeof unicode !== 'number' ? 
unicode : @@ -759,9 +1439,17 @@ function fontCharsToUnicode(charCodes, fontProperties) { result += String.fromCharCode(GlyphsUnicode[glyphName]); continue; } - result += String.fromCharCode(charCode) + result += String.fromCharCode(charCode); } } + // normalizing the unicode characters + for (var i = 0, ii = result.length; i < ii; i++) { + if (!(result[i] in NormalizedUnicodes)) + continue; + result = result.substring(0, i) + NormalizedUnicodes[result[i]] + + result.substring(i + 1); + ii = result.length; + } return result; } @@ -2254,7 +2942,8 @@ var Font = (function FontClosure() { return { fontChar: String.fromCharCode(fontChar), - unicode: unicodeChars, + unicode: (unicodeChars in NormalizedUnicodes) ? + NormalizedUnicodes[unicodeChars] : unicodeChars, width: width, codeIRQueue: codeIRQueue }; From 3bde084ffd383a3dc4cb2dee2cb3bfc80ba2c65e Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Mon, 19 Dec 2011 18:05:32 -0600 Subject: [PATCH 6/9] add "find" images; function name fix --- src/core.js | 2 +- src/worker.js | 2 +- web/images/edit-find.svg | 750 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 752 insertions(+), 2 deletions(-) create mode 100644 web/images/edit-find.svg diff --git a/src/core.js b/src/core.js index 633a36c1d..b14e27460 100644 --- a/src/core.js +++ b/src/core.js @@ -650,7 +650,7 @@ var PDFDoc = (function PDFDocClosure() { throw data.error; }, this); - messageHandler.on('text_extracted', function pdfDocError(data) { + messageHandler.on('text_extracted', function pdfTextExtracted(data) { var index = data[0]; if (this.textExtracted) this.textExtracted(index); diff --git a/src/worker.js b/src/worker.js index dea6339d1..51fbece08 100644 --- a/src/worker.js +++ b/src/worker.js @@ -203,7 +203,7 @@ var WorkerMessageHandler = { function indexPage(pageNum) { if (pageNum > numPages) { - console.log('text indexing=: time=%dms', Date.now() - start); + console.log('text indexing: time=%dms', Date.now() - start); handler.send('text_extracted', 
[index]); return; diff --git a/web/images/edit-find.svg b/web/images/edit-find.svg new file mode 100644 index 000000000..a499b486c --- /dev/null +++ b/web/images/edit-find.svg @@ -0,0 +1,750 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + Edit Find + + + edit + find + locate + search + + + + + + Steven Garrity + + + + + + Jakub Steiner + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 5affc0d7946bc85c3bd87dc516cd68d89d562762 Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Tue, 20 Dec 2011 20:04:42 -0600 Subject: [PATCH 7/9] Disable keyboard shortcuts if current control is INPUT element --- web/viewer.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web/viewer.js b/web/viewer.js index 79df8280d..e0eedaae6 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -964,6 +964,8 @@ window.addEventListener('pagechange', function pagechange(evt) { window.addEventListener('keydown', function keydown(evt) { var curElement = document.activeElement; + if (curElement && curElement.tagName == 'INPUT') + return; var controlsElement = document.getElementById('controls'); while (curElement) { if (curElement === controlsElement) From c9fb5637c3bbb07a250ea4ecb19d79881b275ca2 Mon Sep 17 00:00:00 2001 From: Julian Viereck Date: Sun, 8 Apr 2012 16:18:43 -0700 Subject: [PATCH 8/9] Extract one page after the other and not all pages at once --- src/core.js | 29 +++++++++++++++++++++++++---- src/worker.js | 38 ++++++++++---------------------------- web/viewer.js | 31 ++++++++++++++----------------- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/core.js b/src/core.js index 4bd2cb234..38f264bf5 100644 --- a/src/core.js +++ b/src/core.js @@ -698,6 +698,9 @@ var PDFDoc = (function PDFDocClosure() { this.fontsLoading = {}; 
this.workerReadyPromise = new Promise('workerReady'); + this.pageText = []; + this.startedTextExtraction = false; + // If worker support isn't disabled explicit and the browser has worker // support, create a new web worker and test if it/the browser fullfills // all requirements to run parts of pdf.js in a web worker. @@ -769,7 +772,6 @@ var PDFDoc = (function PDFDocClosure() { WorkerMessageHandler.setup(messageHandler); }, - setupMessageHandler: function PDFDoc_setupMessageHandler(messageHandler) { this.messageHandler = messageHandler; @@ -825,9 +827,18 @@ var PDFDoc = (function PDFDocClosure() { }, this); messageHandler.on('text_extracted', function pdfTextExtracted(data) { - var index = data[0]; + var pageNum = data[0]; + var content = data[1]; + if (pageNum !== this.pageText.length + 1) + error('pdfTextExtracted: pageIdx and pageText length got to fit'); + + this.pageText.push(content); + if (this.textExtracted) - this.textExtracted(index); + this.textExtracted(pageNum, content); + + if (pageNum < this.numPages) + this.extractTextPage(pageNum + 1); }, this); messageHandler.on('jpeg_decode', function(data, promise) { @@ -895,9 +906,19 @@ var PDFDoc = (function PDFDocClosure() { return (this.pageCache[n] = page); }, + extractTextPage: function PDFDoc_extractTextPage(pageNum) { + this.messageHandler.send('extract_text', pageNum); + }, + extractText: function PDFDoc_extractText() { + if (this.startedTextExtraction) + return; + + this.startedTextExtraction = true; + this.workerReadyPromise.then(function pdfDocStartRenderingThen() { - this.messageHandler.send('extract_text'); + // Start the text extraction process. 
+ this.extractTextPage(1); }.bind(this)); }, diff --git a/src/worker.js b/src/worker.js index b75fc66e8..b7679bdbe 100644 --- a/src/worker.js +++ b/src/worker.js @@ -94,7 +94,6 @@ var WorkerMessageHandler = { handler.on('page_request', function wphSetupPageRequest(pageNum) { pageNum = parseInt(pageNum); - // The following code does quite the same as // Page.prototype.startRendering, but stops at one point and sends the // result back to the main thread. @@ -156,37 +155,20 @@ var WorkerMessageHandler = { }); }, this); - handler.on('extract_text', function wphExtractText() { - var numPages = pdfModel.numPages; - var index = []; + handler.on('extract_text', function wphExtractText(pageNum) { var start = Date.now(); - function indexPage(pageNum) { - if (pageNum > numPages) { - console.log('text indexing: time=%dms', Date.now() - start); - - handler.send('text_extracted', [index]); - return; - } - - var textContent = ''; - // try { - var page = pdfModel.getPage(pageNum); - textContent = page.extractTextContent(); - // } catch (e) { - // // Skip errored pages - // } - - index.push(textContent); - - // processing one page, interrupting thread to process - // other requests - setTimeout(function extractTextNextPage() { - indexPage(pageNum + 1); - }, 0); + var textContent = ''; + try { + var page = pdfModel.getPage(pageNum); + textContent = page.extractTextContent(); + } catch (e) { + // Skip errored pages } - indexPage(1); + console.log('text indexing: page=%d - time=%dms', + pageNum, Date.now() - start); + handler.send('text_extracted', [pageNum, textContent]); }); } }; diff --git a/web/viewer.js b/web/viewer.js index 91639d9ee..c827b5bce 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -491,7 +491,7 @@ var PDFView = { var pdf; try { - pdf = new PDFJS.PDFDoc(data); + this.pdfDoc = pdf = new PDFJS.PDFDoc(data); } catch (e) { this.error('An error occurred while reading the PDF.', e); } @@ -576,22 +576,18 @@ var PDFView = { if (pdfTitle) document.title = pdfTitle + ' - ' + 
document.title; - - // loosing pdf reference here, starting text indexing in 500ms - setTimeout((function loadStartTextExtraction() { - this.startTextExtraction(pdf); - }).bind(this), 500); - delete PDFView.extractedText; }, startTextExtraction: function pdfViewStartTextExtraction(pdf) { var searchResults = document.getElementById('searchResults'); searchResults.textContent = ''; - pdf.textExtracted = function pdfTextExtracted(index) { - PDFView.extractedText = index; - }; + pdf.textExtracted = (function pdfTextExtracted(pageIdx, content) { + this.search(); + }).bind(this); pdf.extractText(); + + this.pdfDoc = pdf; }, search: function pdfViewStartSearch() { @@ -604,21 +600,19 @@ var PDFView = { } var searchResults = document.getElementById('searchResults'); - if (!('extractedText' in PDFView)) { - // not indexed yet, repeat in 1 second - searchResults.textContent = 'Searching...'; - setTimeout(this.search.bind(this), 1000); - return; - } var searchTermsInput = document.getElementById('searchTermsInput'); searchResults.removeAttribute('hidden'); searchResults.textContent = ''; var terms = searchTermsInput.value; + + if (!terms) + return; + // simple search: removing spaces and hyphens, then scanning every terms = terms.replace(/\s-/g, '').toLowerCase(); - var index = PDFView.extractedText; + var index = PDFView.pdfDoc.pageText; var pageFound = false; for (var i = 0, ii = index.length; i < ii; i++) { var pageText = index[i].replace(/\s-/g, '').toLowerCase(); @@ -708,6 +702,9 @@ var PDFView = { var searchTermsInput = document.getElementById('searchTermsInput'); searchTermsInput.focus(); + + // Start text extraction as soon as the search gets displayed. 
+ this.pdfDoc.extractText(); } else { searchScrollView.setAttribute('hidden', 'true'); searchSwitchButton.removeAttribute('data-selected'); From 12b27044aa05df0478b9eaf410da5b95c2d3bf16 Mon Sep 17 00:00:00 2001 From: Julian Viereck Date: Sun, 8 Apr 2012 16:31:29 -0700 Subject: [PATCH 9/9] Fix bug such that search is updated once a new page arrives + make the search update happen only every 250ms --- web/viewer.js | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/web/viewer.js b/web/viewer.js index c827b5bce..ec18ea55e 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -576,21 +576,34 @@ var PDFView = { if (pdfTitle) document.title = pdfTitle + ' - ' + document.title; + + pdf.textExtracted = (function pdfTextExtracted(pageIdx, content) { + this.search(); + }).bind(this); }, startTextExtraction: function pdfViewStartTextExtraction(pdf) { var searchResults = document.getElementById('searchResults'); searchResults.textContent = ''; - pdf.textExtracted = (function pdfTextExtracted(pageIdx, content) { - this.search(); - }).bind(this); - pdf.extractText(); - - this.pdfDoc = pdf; + this.pdfDoc.extractText(); }, search: function pdfViewStartSearch() { + // Limit this function to run at most once every SEARCH_TIMEOUT ms. + var SEARCH_TIMEOUT = 250; + var lastSeach = this.lastSearch; + var now = Date.now(); + if (lastSeach && (now - lastSeach) < SEARCH_TIMEOUT) { + if (!this.searchTimer) + this.searchTimer = + setTimeout(this.search, SEARCH_TIMEOUT - (now - lastSeach)); + + return; + } + this.searchTimer = null; + this.lastSearch = now; + function bindLink(link, pageNumber) { link.href = '#' + pageNumber; link.onclick = function searchBindLink() {