Text char codes extraction
This commit is contained in:
		
							parent
							
								
									853f16085f
								
							
						
					
					
						commit
						3b72c6063c
					
				
							
								
								
									
										50
									
								
								src/core.js
									
									
									
									
									
								
							
							
						
						
									
										50
									
								
								src/core.js
									
									
									
									
									
								
							| @ -200,10 +200,12 @@ var Page = (function PageClosure() { | ||||
|       if (isArray(content)) { | ||||
|         // fetching items
 | ||||
|         var i, n = content.length; | ||||
|         var streams = []; | ||||
|         for (i = 0; i < n; ++i) | ||||
|           content[i] = xref.fetchIfRef(content[i]); | ||||
|         content = new StreamsSequenceStream(content); | ||||
|       } | ||||
|           streams.push(xref.fetchIfRef(content[i])); | ||||
|         content = new StreamsSequenceStream(streams); | ||||
|       } else if (isStream(content)) | ||||
|         content.pos = 0; | ||||
| 
 | ||||
|       var pe = this.pe = new PartialEvaluator( | ||||
|                                 xref, handler, 'p' + this.pageNumber + '_'); | ||||
| @ -212,6 +214,36 @@ var Page = (function PageClosure() { | ||||
|                                            dependency)); | ||||
|     }, | ||||
| 
 | ||||
|     extractTextContent: function pageExtractPageContent() { | ||||
|       if ('textContent' in this) { | ||||
|         // text content was extracted
 | ||||
|         return this.textContent; | ||||
|       } | ||||
| 
 | ||||
|       var handler = { | ||||
|         on: function () {}, | ||||
|         send: function() {} | ||||
|       }; | ||||
| 
 | ||||
|       var xref = this.xref; | ||||
|       var content = xref.fetchIfRef(this.content); | ||||
|       var resources = xref.fetchIfRef(this.resources); | ||||
|       if (isArray(content)) { | ||||
|         // fetching items
 | ||||
|         var i, n = content.length; | ||||
|         var streams = []; | ||||
|         for (i = 0; i < n; ++i) | ||||
|           streams.push(xref.fetchIfRef(content[i])); | ||||
|         content = new StreamsSequenceStream(streams); | ||||
|       } else if (isStream(content)) | ||||
|         content.pos = 0; | ||||
| 
 | ||||
|       var pe = new PartialEvaluator( | ||||
|                      xref, handler, 'p' + this.pageNumber + '_'); | ||||
|       var text = pe.getTextContent(content, resources); | ||||
|       return (this.textContent = text); | ||||
|     }, | ||||
| 
 | ||||
|     ensureFonts: function pageEnsureFonts(fonts, callback) { | ||||
|       // Convert the font names to the corresponding font obj.
 | ||||
|       for (var i = 0, ii = fonts.length; i < ii; i++) { | ||||
| @ -614,6 +646,12 @@ var PDFDoc = (function PDFDocClosure() { | ||||
|           throw data.error; | ||||
|       }, this); | ||||
| 
 | ||||
|       messageHandler.on('text_extracted', function pdfDocError(data) { | ||||
|         var index = data.index; | ||||
|         if (this.textExtracted) | ||||
|           this.textExtracted(index); | ||||
|       }, this); | ||||
| 
 | ||||
|       setTimeout(function pdfDocFontReadySetTimeout() { | ||||
|         messageHandler.send('doc', this.data); | ||||
|         this.workerReadyPromise.resolve(true); | ||||
| @ -643,6 +681,12 @@ var PDFDoc = (function PDFDocClosure() { | ||||
|       return (this.pageCache[n] = page); | ||||
|     }, | ||||
| 
 | ||||
|     extractText: function pdfDocExtractExtractText() { | ||||
|       this.workerReadyPromise.then(function pdfDocStartRenderingThen() { | ||||
|         this.messageHandler.send('extract_text'); | ||||
|       }.bind(this)); | ||||
|     }, | ||||
| 
 | ||||
|     destroy: function pdfDocDestroy() { | ||||
|       if (this.worker) | ||||
|         this.worker.terminate(); | ||||
|  | ||||
| @ -144,7 +144,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|         fontRef = fontRef || fontRes.get(fontName); | ||||
|         var font = xref.fetchIfRef(fontRef); | ||||
|         assertWellFormed(isDict(font)); | ||||
|         if (!font.translated) { | ||||
|         if (!font.loadedName) { | ||||
|           font.translated = self.translateFont(font, xref, resources, | ||||
|                                                dependency); | ||||
|           if (font.translated) { | ||||
| @ -464,6 +464,65 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
|       }; | ||||
|     }, | ||||
| 
 | ||||
|     getTextContent: function partialEvaluatorGetIRQueue(stream, resources) { | ||||
| 
 | ||||
|       var self = this; | ||||
|       var xref = this.xref; | ||||
| 
 | ||||
|       function handleSetFont(fontName, fontRef) { | ||||
|         var fontRes = resources.get('Font'); | ||||
| 
 | ||||
|         // TODO: TOASK: Is it possible to get here? If so, what does
 | ||||
|         // args[0].name should be like???
 | ||||
|         assert(fontRes, 'fontRes not available'); | ||||
| 
 | ||||
|         fontRes = xref.fetchIfRef(fontRes); | ||||
|         fontRef = fontRef || fontRes.get(fontName); | ||||
|         var font = xref.fetchIfRef(fontRef), tra; | ||||
|         assertWellFormed(isDict(font)); | ||||
|         if (!font.translated) { | ||||
|           font.translated = self.translateFont(font, xref, resources); | ||||
|         } | ||||
|         return font; | ||||
|       } | ||||
| 
 | ||||
|       resources = xref.fetchIfRef(resources) || new Dict(); | ||||
| 
 | ||||
|       var parser = new Parser(new Lexer(stream), false); | ||||
|       var res = resources; | ||||
|       var args = [], obj; | ||||
| 
 | ||||
|       var text = ''; | ||||
|       var font = null; | ||||
|       while (!isEOF(obj = parser.getObj())) { | ||||
|         if (isCmd(obj)) { | ||||
|           var cmd = obj.cmd; | ||||
|           switch (cmd) { | ||||
|             case 'Tf': | ||||
|               font = handleSetFont(args[0].name); | ||||
|               break; | ||||
|             case 'TJ': | ||||
|               var items = args[0]; | ||||
|               for (var j = 0, jj = items.length; j < jj; j++) { | ||||
|                 if (typeof items[j] === 'string') | ||||
|                   text += items[j]; | ||||
|               } | ||||
|               break; | ||||
|             case 'Tj': | ||||
|               text += args[0]; | ||||
|               break; | ||||
|           } // switch
 | ||||
| 
 | ||||
|           args = []; | ||||
|         } else if (obj != null) { | ||||
|           assertWellFormed(args.length <= 33, 'Too many arguments'); | ||||
|           args.push(obj); | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|       return text; | ||||
|     }, | ||||
| 
 | ||||
|     extractDataStructures: function | ||||
|       partialEvaluatorExtractDataStructures(dict, baseDict, | ||||
|                                             xref, properties) { | ||||
| @ -837,15 +896,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { | ||||
| 
 | ||||
|       if (type.name === 'Type3') { | ||||
|         properties.coded = true; | ||||
|         var charProcs = xref.fetchIfRef(dict.get('CharProcs')); | ||||
|         var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; | ||||
|         properties.resources = fontResources; | ||||
|         properties.charProcIRQueues = {}; | ||||
|         for (var key in charProcs.map) { | ||||
|           var glyphStream = xref.fetchIfRef(charProcs.map[key]); | ||||
|           var queueObj = {}; | ||||
|           properties.charProcIRQueues[key] = | ||||
|             this.getIRQueue(glyphStream, fontResources, queueObj, dependency); | ||||
|         // read char procs only if dependency is specified
 | ||||
|         if (dependency) { | ||||
|           var charProcs = xref.fetchIfRef(dict.get('CharProcs')); | ||||
|           var fontResources = xref.fetchIfRef(dict.get('Resources')) || resources; | ||||
|           properties.resources = fontResources; | ||||
|           properties.charProcIRQueues = {}; | ||||
|           for (var key in charProcs.map) { | ||||
|             var glyphStream = xref.fetchIfRef(charProcs.map[key]); | ||||
|             var queueObj = {}; | ||||
|             properties.charProcIRQueues[key] = | ||||
|               this.getIRQueue(glyphStream, fontResources, queueObj, dependency); | ||||
|           } | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|  | ||||
| @ -160,6 +160,28 @@ var WorkerMessageHandler = { | ||||
| 
 | ||||
|       handler.send('font_ready', [objId, obj]); | ||||
|     }); | ||||
| 
 | ||||
|     handler.on('extract_text', function wphExtractText() { | ||||
|       var numPages = pdfDoc.numPages; | ||||
|       var index = []; | ||||
|       for (var i = 0; i < numPages; i++) { | ||||
|         var start = Date.now(); | ||||
| 
 | ||||
|         var textContent = ''; | ||||
|         try { | ||||
|           var page = pdfDoc.getPage(i + 1); | ||||
|           textContent = page.extractTextContent(); | ||||
|         } catch (e) { | ||||
|           // Skip errored pages
 | ||||
|         } | ||||
| 
 | ||||
|         index.push(textContent); | ||||
|       } | ||||
| 
 | ||||
|       console.log('text indexing=: time=%dms', Date.now() - start); | ||||
| 
 | ||||
|       handler.send('text_extracted', { index: index }); | ||||
|     }); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|  | ||||
| @ -309,6 +309,17 @@ var PDFView = { | ||||
|     } | ||||
|     else | ||||
|       this.page = 1; | ||||
| 
 | ||||
|     setTimeout((function loadStartTextExtraction() { | ||||
|       this.startTextExtraction(pdf); | ||||
|     }).bind(this), 500); | ||||
|   }, | ||||
| 
 | ||||
|   startTextExtraction: function(pdf) { | ||||
|     pdf.textExtracted = function pdfTextExtracted(index) { | ||||
|       console.log(index.join()); | ||||
|     }; | ||||
|     pdf.extractText(); | ||||
|   }, | ||||
| 
 | ||||
|   setHash: function pdfViewSetHash(hash) { | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user