Text char codes extraction
This commit is contained in:
		
							parent
							
								
									853f16085f
								
							
						
					
					
						commit
						3b72c6063c
					
				
							
								
								
									
										50
									
								
								src/core.js
									
									
									
									
									
								
							
							
						
						
									
										50
									
								
								src/core.js
									
									
									
									
									
								
							| @ -200,10 +200,12 @@ var Page = (function PageClosure() { | |||||||
|       if (isArray(content)) { |       if (isArray(content)) { | ||||||
|         // fetching items
 |         // fetching items
 | ||||||
|         var i, n = content.length; |         var i, n = content.length; | ||||||
|  |         var streams = []; | ||||||
|         for (i = 0; i < n; ++i) |         for (i = 0; i < n; ++i) | ||||||
|           content[i] = xref.fetchIfRef(content[i]); |           streams.push(xref.fetchIfRef(content[i])); | ||||||
|         content = new StreamsSequenceStream(content); |         content = new StreamsSequenceStream(streams); | ||||||
|       } |       } else if (isStream(content)) | ||||||
|  |         content.pos = 0; | ||||||
| 
 | 
 | ||||||
|       var pe = this.pe = new PartialEvaluator( |       var pe = this.pe = new PartialEvaluator( | ||||||
|                                 xref, handler, 'p' + this.pageNumber + '_'); |                                 xref, handler, 'p' + this.pageNumber + '_'); | ||||||
| @ -212,6 +214,36 @@ var Page = (function PageClosure() { | |||||||
|                                            dependency)); |                                            dependency)); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     extractTextContent: function pageExtractPageContent() { | ||||||
|  |       if ('textContent' in this) { | ||||||
|  |         // text content was extracted
 | ||||||
|  |         return this.textContent; | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       var handler = { | ||||||
|  |         on: function () {}, | ||||||
|  |         send: function() {} | ||||||
|  |       }; | ||||||
|  | 
 | ||||||
|  |       var xref = this.xref; | ||||||
|  |       var content = xref.fetchIfRef(this.content); | ||||||
|  |       var resources = xref.fetchIfRef(this.resources); | ||||||
|  |       if (isArray(content)) { | ||||||
|  |         // fetching items
 | ||||||
|  |         var i, n = content.length; | ||||||
|  |         var streams = []; | ||||||
|  |         for (i = 0; i < n; ++i) | ||||||
|  |           streams.push(xref.fetchIfRef(content[i])); | ||||||
|  |         content = new StreamsSequenceStream(streams); | ||||||
|  |       } else if (isStream(content)) | ||||||
|  |         content.pos = 0; | ||||||
|  | 
 | ||||||
|  |       var pe = new PartialEvaluator( | ||||||
|  |                      xref, handler, 'p' + this.pageNumber + '_'); | ||||||
|  |       var text = pe.getTextContent(content, resources); | ||||||
|  |       return (this.textContent = text); | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     ensureFonts: function pageEnsureFonts(fonts, callback) { |     ensureFonts: function pageEnsureFonts(fonts, callback) { | ||||||
|       // Convert the font names to the corresponding font obj.
 |       // Convert the font names to the corresponding font obj.
 | ||||||
|       for (var i = 0, ii = fonts.length; i < ii; i++) { |       for (var i = 0, ii = fonts.length; i < ii; i++) { | ||||||
| @ -614,6 +646,12 @@ var PDFDoc = (function PDFDocClosure() { | |||||||
|           throw data.error; |           throw data.error; | ||||||
|       }, this); |       }, this); | ||||||
| 
 | 
 | ||||||
|  |       messageHandler.on('text_extracted', function pdfDocError(data) { | ||||||
|  |         var index = data.index; | ||||||
|  |         if (this.textExtracted) | ||||||
|  |           this.textExtracted(index); | ||||||
|  |       }, this); | ||||||
|  | 
 | ||||||
|       setTimeout(function pdfDocFontReadySetTimeout() { |       setTimeout(function pdfDocFontReadySetTimeout() { | ||||||
|         messageHandler.send('doc', this.data); |         messageHandler.send('doc', this.data); | ||||||
|         this.workerReadyPromise.resolve(true); |         this.workerReadyPromise.resolve(true); | ||||||
| @ -643,6 +681,12 @@ var PDFDoc = (function PDFDocClosure() { | |||||||
|       return (this.pageCache[n] = page); |       return (this.pageCache[n] = page); | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     extractText: function pdfDocExtractExtractText() { | ||||||
|  |       this.workerReadyPromise.then(function pdfDocStartRenderingThen() { | ||||||
|  |         this.messageHandler.send('extract_text'); | ||||||
|  |       }.bind(this)); | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     destroy: function pdfDocDestroy() { |     destroy: function pdfDocDestroy() { | ||||||
|       if (this.worker) |       if (this.worker) | ||||||
|         this.worker.terminate(); |         this.worker.terminate(); | ||||||
|  | |||||||
| @ -144,7 +144,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | |||||||
|         fontRef = fontRef || fontRes.get(fontName); |         fontRef = fontRef || fontRes.get(fontName); | ||||||
|         var font = xref.fetchIfRef(fontRef); |         var font = xref.fetchIfRef(fontRef); | ||||||
|         assertWellFormed(isDict(font)); |         assertWellFormed(isDict(font)); | ||||||
|         if (!font.translated) { |         if (!font.loadedName) { | ||||||
|           font.translated = self.translateFont(font, xref, resources, |           font.translated = self.translateFont(font, xref, resources, | ||||||
|                                                dependency); |                                                dependency); | ||||||
|           if (font.translated) { |           if (font.translated) { | ||||||
| @ -464,6 +464,65 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | |||||||
|       }; |       }; | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     getTextContent: function partialEvaluatorGetIRQueue(stream, resources) { | ||||||
|  | 
 | ||||||
|  |       var self = this; | ||||||
|  |       var xref = this.xref; | ||||||
|  | 
 | ||||||
|  |       function handleSetFont(fontName, fontRef) { | ||||||
|  |         var fontRes = resources.get('Font'); | ||||||
|  | 
 | ||||||
|  |         // TODO: TOASK: Is it possible to get here? If so, what does
 | ||||||
|  |         // args[0].name should be like???
 | ||||||
|  |         assert(fontRes, 'fontRes not available'); | ||||||
|  | 
 | ||||||
|  |         fontRes = xref.fetchIfRef(fontRes); | ||||||
|  |         fontRef = fontRef || fontRes.get(fontName); | ||||||
|  |         var font = xref.fetchIfRef(fontRef), tra; | ||||||
|  |         assertWellFormed(isDict(font)); | ||||||
|  |         if (!font.translated) { | ||||||
|  |           font.translated = self.translateFont(font, xref, resources); | ||||||
|  |         } | ||||||
|  |         return font; | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       resources = xref.fetchIfRef(resources) || new Dict(); | ||||||
|  | 
 | ||||||
|  |       var parser = new Parser(new Lexer(stream), false); | ||||||
|  |       var res = resources; | ||||||
|  |       var args = [], obj; | ||||||
|  | 
 | ||||||
|  |       var text = ''; | ||||||
|  |       var font = null; | ||||||
|  |       while (!isEOF(obj = parser.getObj())) { | ||||||
|  |         if (isCmd(obj)) { | ||||||
|  |           var cmd = obj.cmd; | ||||||
|  |           switch (cmd) { | ||||||
|  |             case 'Tf': | ||||||
|  |               font = handleSetFont(args[0].name); | ||||||
|  |               break; | ||||||
|  |             case 'TJ': | ||||||
|  |               var items = args[0]; | ||||||
|  |               for (var j = 0, jj = items.length; j < jj; j++) { | ||||||
|  |                 if (typeof items[j] === 'string') | ||||||
|  |                   text += items[j]; | ||||||
|  |               } | ||||||
|  |               break; | ||||||
|  |             case 'Tj': | ||||||
|  |               text += args[0]; | ||||||
|  |               break; | ||||||
|  |           } // switch
 | ||||||
|  | 
 | ||||||
|  |           args = []; | ||||||
|  |         } else if (obj != null) { | ||||||
|  |           assertWellFormed(args.length <= 33, 'Too many arguments'); | ||||||
|  |           args.push(obj); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       return text; | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     extractDataStructures: function |     extractDataStructures: function | ||||||
|       partialEvaluatorExtractDataStructures(dict, baseDict, |       partialEvaluatorExtractDataStructures(dict, baseDict, | ||||||
|                                             xref, properties) { |                                             xref, properties) { | ||||||
| @ -837,15 +896,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | |||||||
| 
 | 
 | ||||||
|       if (type.name === 'Type3') { |       if (type.name === 'Type3') { | ||||||
|         properties.coded = true; |         properties.coded = true; | ||||||
|         var charProcs = xref.fetchIfRef(dict.get('CharProcs')); |         // read char procs only if dependency is specified
 | ||||||
|         var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; |         if (dependency) { | ||||||
|         properties.resources = fontResources; |           var charProcs = xref.fetchIfRef(dict.get('CharProcs')); | ||||||
|         properties.charProcIRQueues = {}; |           var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; | ||||||
|         for (var key in charProcs.map) { |           properties.resources = fontResources; | ||||||
|           var glyphStream = xref.fetchIfRef(charProcs.map[key]); |           properties.charProcIRQueues = {}; | ||||||
|           var queueObj = {}; |           for (var key in charProcs.map) { | ||||||
|           properties.charProcIRQueues[key] = |             var glyphStream = xref.fetchIfRef(charProcs.map[key]); | ||||||
|             this.getIRQueue(glyphStream, fontResources, queueObj, dependency); |             var queueObj = {}; | ||||||
|  |             properties.charProcIRQueues[key] = | ||||||
|  |               this.getIRQueue(glyphStream, fontResources, queueObj, dependency); | ||||||
|  |           } | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -160,6 +160,28 @@ var WorkerMessageHandler = { | |||||||
| 
 | 
 | ||||||
|       handler.send('font_ready', [objId, obj]); |       handler.send('font_ready', [objId, obj]); | ||||||
|     }); |     }); | ||||||
|  | 
 | ||||||
|  |     handler.on('extract_text', function wphExtractText() { | ||||||
|  |       var numPages = pdfDoc.numPages; | ||||||
|  |       var index = []; | ||||||
|  |       for (var i = 0; i < numPages; i++) { | ||||||
|  |         var start = Date.now(); | ||||||
|  | 
 | ||||||
|  |         var textContent = ''; | ||||||
|  |         try { | ||||||
|  |           var page = pdfDoc.getPage(i + 1); | ||||||
|  |           textContent = page.extractTextContent(); | ||||||
|  |         } catch (e) { | ||||||
|  |           // Skip errored pages
 | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         index.push(textContent); | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       console.log('text indexing=: time=%dms', Date.now() - start); | ||||||
|  | 
 | ||||||
|  |       handler.send('text_extracted', { index: index }); | ||||||
|  |     }); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -309,6 +309,17 @@ var PDFView = { | |||||||
|     } |     } | ||||||
|     else |     else | ||||||
|       this.page = 1; |       this.page = 1; | ||||||
|  | 
 | ||||||
|  |     setTimeout((function loadStartTextExtraction() { | ||||||
|  |       this.startTextExtraction(pdf); | ||||||
|  |     }).bind(this), 500); | ||||||
|  |   }, | ||||||
|  | 
 | ||||||
|  |   startTextExtraction: function(pdf) { | ||||||
|  |     pdf.textExtracted = function pdfTextExtracted(index) { | ||||||
|  |       console.log(index.join()); | ||||||
|  |     }; | ||||||
|  |     pdf.extractText(); | ||||||
|   }, |   }, | ||||||
| 
 | 
 | ||||||
|   setHash: function pdfViewSetHash(hash) { |   setHash: function pdfViewSetHash(hash) { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user