Remove the invisible format marks from the text chunks
- This aims to fix issue #9186 by skipping glyphs in the Unicode "Cf" (format) category when building the text content.
This commit is contained in:
		
							parent
							
								
									88236e1163
								
							
						
					
					
						commit
						e1d3a3b414
					
				| @ -2561,6 +2561,9 @@ class PartialEvaluator { | |||||||
| 
 | 
 | ||||||
|       for (let i = 0, ii = glyphs.length; i < ii; i++) { |       for (let i = 0, ii = glyphs.length; i < ii; i++) { | ||||||
|         const glyph = glyphs[i]; |         const glyph = glyphs[i]; | ||||||
|  |         if (glyph.isInvisibleFormatMark) { | ||||||
|  |           continue; | ||||||
|  |         } | ||||||
|         let charSpacing = |         let charSpacing = | ||||||
|           textState.charSpacing + (i + 1 === ii ? extraSpacing : 0); |           textState.charSpacing + (i + 1 === ii ? extraSpacing : 0); | ||||||
| 
 | 
 | ||||||
| @ -2601,7 +2604,7 @@ class PartialEvaluator { | |||||||
|         // Must be called after compareWithLastPosition because
 |         // Must be called after compareWithLastPosition because
 | ||||||
|         // the textContentItem could have been flushed.
 |         // the textContentItem could have been flushed.
 | ||||||
|         const textChunk = ensureTextContentItem(); |         const textChunk = ensureTextContentItem(); | ||||||
|         if (glyph.isDiacritic) { |         if (glyph.isZeroWidthDiacritic) { | ||||||
|           scaledDim = 0; |           scaledDim = 0; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -216,7 +216,8 @@ class Glyph { | |||||||
| 
 | 
 | ||||||
|     const category = getCharUnicodeCategory(unicode); |     const category = getCharUnicodeCategory(unicode); | ||||||
|     this.isWhitespace = category.isWhitespace; |     this.isWhitespace = category.isWhitespace; | ||||||
|     this.isDiacritic = category.isDiacritic; |     this.isZeroWidthDiacritic = category.isZeroWidthDiacritic; | ||||||
|  |     this.isInvisibleFormatMark = category.isInvisibleFormatMark; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   matchesForCache( |   matchesForCache( | ||||||
|  | |||||||
| @ -1640,12 +1640,13 @@ function reverseIfRtl(chars) { | |||||||
|   return buf.join(""); |   return buf.join(""); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})$", "u"); | const SpecialCharRegExp = new RegExp("^(\\s)|(\\p{Mn})|(\\p{Cf})$", "u"); | ||||||
| function getCharUnicodeCategory(char) { | function getCharUnicodeCategory(char) { | ||||||
|   const groups = char.match(SpecialCharRegExp); |   const groups = char.match(SpecialCharRegExp); | ||||||
|   return { |   return { | ||||||
|     isWhitespace: !!(groups && groups[1]), |     isWhitespace: !!(groups && groups[1]), | ||||||
|     isDiacritic: !!(groups && groups[2]), |     isZeroWidthDiacritic: !!(groups && groups[2]), | ||||||
|  |     isInvisibleFormatMark: !!(groups && groups[3]), | ||||||
|   }; |   }; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										1
									
								
								test/pdfs/issue9186.pdf.link
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								test/pdfs/issue9186.pdf.link
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | |||||||
|  | https://github.com/mozilla/pdf.js/files/1500985/Sample.lease.contract.26.pdf | ||||||
| @ -6230,5 +6230,13 @@ | |||||||
|       "firstPage": 1, |       "firstPage": 1, | ||||||
|       "lastPage": 1, |       "lastPage": 1, | ||||||
|       "type": "eq" |       "type": "eq" | ||||||
|  |    }, | ||||||
|  |    { "id": "issue9186", | ||||||
|  |       "file": "pdfs/issue9186.pdf", | ||||||
|  |       "md5": "d151857bb724ab9a291704a45a0b5d7f", | ||||||
|  |       "rounds": 1, | ||||||
|  |       "link": true, | ||||||
|  |       "lastPage": 1, | ||||||
|  |       "type": "text" | ||||||
|    } |    } | ||||||
| ] | ] | ||||||
|  | |||||||
| @ -2147,6 +2147,29 @@ sozialökonomische Gerechtigkeit.`) | |||||||
|       await loadingTask.destroy(); |       await loadingTask.destroy(); | ||||||
|     }); |     }); | ||||||
| 
 | 
 | ||||||
|  |     it("gets text content, with invisible text marks (issue 9186)", async function () { | ||||||
|  |       if (isNodeJS) { | ||||||
|  |         pending("Linked test-cases are not supported in Node.js."); | ||||||
|  |       } | ||||||
|  | 
 | ||||||
|  |       const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf")); | ||||||
|  |       const pdfDoc = await loadingTask.promise; | ||||||
|  |       const pdfPage = await pdfDoc.getPage(1); | ||||||
|  |       const { items } = await pdfPage.getTextContent(); | ||||||
|  |       const text = mergeText(items); | ||||||
|  | 
 | ||||||
|  |       expect( | ||||||
|  |         text.includes(`This Agreement (“Agreement”) is made as of this 25th day of January, 2017, by and
 | ||||||
|  | between EDWARD G. ATSINGER III, not individually but as sole Trustee of the ATSINGER | ||||||
|  | FAMILY TRUST /u/a dated October 31, 1980 as amended, and STUART W. EPPERSON, not | ||||||
|  | individually but solely as Trustee of the STUART W. EPPERSON REVOCABLE LIVING | ||||||
|  | TRUST /u/a dated January 14th 1993 as amended, collectively referred to herein as “Lessor”, and | ||||||
|  | Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
 | ||||||
|  |       ).toEqual(true); | ||||||
|  | 
 | ||||||
|  |       await loadingTask.destroy(); | ||||||
|  |     }); | ||||||
|  | 
 | ||||||
|     it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () { |     it("gets text content, with beginbfrange operator handled correctly (bug 1627427)", async function () { | ||||||
|       const loadingTask = getDocument( |       const loadingTask = getDocument( | ||||||
|         buildGetDocumentParams("bug1627427_reduced.pdf") |         buildGetDocumentParams("bug1627427_reduced.pdf") | ||||||
|  | |||||||
| @ -47,19 +47,67 @@ describe("unicode", function () { | |||||||
|     it("should correctly determine the character category", function () { |     it("should correctly determine the character category", function () { | ||||||
|       const tests = { |       const tests = { | ||||||
|         // Whitespace
 |         // Whitespace
 | ||||||
|         " ": { isDiacritic: false, isWhitespace: true }, |         " ": { | ||||||
|         "\t": { isDiacritic: false, isWhitespace: true }, |           isZeroWidthDiacritic: false, | ||||||
|         "\u2001": { isDiacritic: false, isWhitespace: true }, |           isInvisibleFormatMark: false, | ||||||
|         "\uFEFF": { isDiacritic: false, isWhitespace: true }, |           isWhitespace: true, | ||||||
|  |         }, | ||||||
|  |         "\t": { | ||||||
|  |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: true, | ||||||
|  |         }, | ||||||
|  |         "\u2001": { | ||||||
|  |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: true, | ||||||
|  |         }, | ||||||
|  |         "\uFEFF": { | ||||||
|  |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: true, | ||||||
|  |         }, | ||||||
| 
 | 
 | ||||||
|         // Diacritic
 |         // Diacritic
 | ||||||
|         "\u0302": { isDiacritic: true, isWhitespace: false }, |         "\u0302": { | ||||||
|         "\u0344": { isDiacritic: true, isWhitespace: false }, |           isZeroWidthDiacritic: true, | ||||||
|         "\u0361": { isDiacritic: true, isWhitespace: false }, |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
|  |         "\u0344": { | ||||||
|  |           isZeroWidthDiacritic: true, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
|  |         "\u0361": { | ||||||
|  |           isZeroWidthDiacritic: true, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
| 
 | 
 | ||||||
|         // No whitespace or diacritic
 |         // Invisible format mark
 | ||||||
|         a: { isDiacritic: false, isWhitespace: false }, |         "\u200B": { | ||||||
|         1: { isDiacritic: false, isWhitespace: false }, |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: true, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
|  |         "\u200D": { | ||||||
|  |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: true, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
|  | 
 | ||||||
|  |         // No whitespace or diacritic or invisible format mark
 | ||||||
|  |         a: { | ||||||
|  |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
|  |         1: { | ||||||
|  |           isZeroWidthDiacritic: false, | ||||||
|  |           isInvisibleFormatMark: false, | ||||||
|  |           isWhitespace: false, | ||||||
|  |         }, | ||||||
|       }; |       }; | ||||||
|       for (const [character, expectation] of Object.entries(tests)) { |       for (const [character, expectation] of Object.entries(tests)) { | ||||||
|         expect(getCharUnicodeCategory(character)).toEqual(expectation); |         expect(getCharUnicodeCategory(character)).toEqual(expectation); | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user