Merge pull request #16200 from calixteman/dont_normalize
[api-minor] Don't normalize the text used in the text layer.
This commit is contained in:
commit
dbe0c4e60c
@ -147,7 +147,11 @@ function bidi(str, startLevel = -1, vertical = false) {
|
||||
if (!charType) {
|
||||
warn("Bidi: invalid Unicode character " + charCode.toString(16));
|
||||
}
|
||||
} else if (0x0700 <= charCode && charCode <= 0x08ac) {
|
||||
} else if (
|
||||
(0x0700 <= charCode && charCode <= 0x08ac) ||
|
||||
(0xfb50 <= charCode && charCode <= 0xfdff) ||
|
||||
(0xfe70 <= charCode && charCode <= 0xfeff)
|
||||
) {
|
||||
charType = "AL";
|
||||
}
|
||||
if (charType === "R" || charType === "AL" || charType === "AN") {
|
||||
|
@ -511,7 +511,13 @@ class Page {
|
||||
});
|
||||
}
|
||||
|
||||
extractTextContent({ handler, task, includeMarkedContent, sink }) {
|
||||
extractTextContent({
|
||||
handler,
|
||||
task,
|
||||
includeMarkedContent,
|
||||
disableNormalization,
|
||||
sink,
|
||||
}) {
|
||||
const contentStreamPromise = this.getContentStream();
|
||||
const resourcesPromise = this.loadResources([
|
||||
"ExtGState",
|
||||
@ -539,6 +545,7 @@ class Page {
|
||||
task,
|
||||
resources: this.resources,
|
||||
includeMarkedContent,
|
||||
disableNormalization,
|
||||
sink,
|
||||
viewBox: this.view,
|
||||
});
|
||||
|
@ -24,6 +24,7 @@ import {
|
||||
IDENTITY_MATRIX,
|
||||
info,
|
||||
isArrayEqual,
|
||||
normalizeUnicode,
|
||||
OPS,
|
||||
shadow,
|
||||
stringToPDFString,
|
||||
@ -2271,6 +2272,7 @@ class PartialEvaluator {
|
||||
seenStyles = new Set(),
|
||||
viewBox,
|
||||
markedContentData = null,
|
||||
disableNormalization = false,
|
||||
}) {
|
||||
// Ensure that `resources`/`stateManager` is correctly initialized,
|
||||
// even if the provided parameter is e.g. `null`.
|
||||
@ -2524,7 +2526,10 @@ class PartialEvaluator {
|
||||
}
|
||||
|
||||
function runBidiTransform(textChunk) {
|
||||
const text = textChunk.str.join("");
|
||||
let text = textChunk.str.join("");
|
||||
if (!disableNormalization) {
|
||||
text = normalizeUnicode(text);
|
||||
}
|
||||
const bidiResult = bidi(text, -1, textChunk.vertical);
|
||||
return {
|
||||
str: bidiResult.str,
|
||||
@ -2859,7 +2864,7 @@ class PartialEvaluator {
|
||||
textChunk.prevTransform = getCurrentTextTransform();
|
||||
}
|
||||
|
||||
const glyphUnicode = glyph.normalizedUnicode;
|
||||
const glyphUnicode = glyph.unicode;
|
||||
if (saveLastChar(glyphUnicode)) {
|
||||
// The two last chars are a non-whitespace followed by a whitespace
|
||||
// and then this non-whitespace, so we insert a whitespace here.
|
||||
@ -3242,6 +3247,7 @@ class PartialEvaluator {
|
||||
seenStyles,
|
||||
viewBox,
|
||||
markedContentData,
|
||||
disableNormalization,
|
||||
})
|
||||
.then(function () {
|
||||
if (!sinkWrapper.enqueueInvoked) {
|
||||
|
@ -33,11 +33,9 @@ import {
|
||||
} from "./fonts_utils.js";
|
||||
import {
|
||||
getCharUnicodeCategory,
|
||||
getNormalizedUnicodes,
|
||||
getUnicodeForGlyph,
|
||||
getUnicodeRangeFor,
|
||||
mapSpecialUnicodeValues,
|
||||
reverseIfRtl,
|
||||
} from "./unicode.js";
|
||||
import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
|
||||
import {
|
||||
@ -277,24 +275,6 @@ class Glyph {
|
||||
/* nonSerializable = */ true
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* This property, which is only used by `PartialEvaluator.getTextContent`,
|
||||
* is purposely made non-serializable.
|
||||
* @type {string}
|
||||
*/
|
||||
get normalizedUnicode() {
|
||||
return shadow(
|
||||
this,
|
||||
"normalizedUnicode",
|
||||
reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
|
||||
/* nonSerializable = */ true
|
||||
);
|
||||
}
|
||||
|
||||
static get _NormalizedUnicodes() {
|
||||
return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
|
||||
}
|
||||
}
|
||||
|
||||
function int16(b0, b1) {
|
||||
@ -507,6 +487,9 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
|
||||
const privateUseOffetStart = PRIVATE_USE_AREAS[privateUseAreaIndex][0];
|
||||
let nextAvailableFontCharCode = privateUseOffetStart;
|
||||
let privateUseOffetEnd = PRIVATE_USE_AREAS[privateUseAreaIndex][1];
|
||||
const isInPrivateArea = code =>
|
||||
(PRIVATE_USE_AREAS[0][0] <= code && code <= PRIVATE_USE_AREAS[0][1]) ||
|
||||
(PRIVATE_USE_AREAS[1][0] <= code && code <= PRIVATE_USE_AREAS[1][1]);
|
||||
for (let originalCharCode in charCodeToGlyphId) {
|
||||
originalCharCode |= 0;
|
||||
let glyphId = charCodeToGlyphId[originalCharCode];
|
||||
@ -539,11 +522,7 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
|
||||
if (typeof unicode === "string") {
|
||||
unicode = unicode.codePointAt(0);
|
||||
}
|
||||
if (
|
||||
unicode &&
|
||||
unicode < privateUseOffetStart &&
|
||||
!usedGlyphIds.has(glyphId)
|
||||
) {
|
||||
if (unicode && !isInPrivateArea(unicode) && !usedGlyphIds.has(glyphId)) {
|
||||
toUnicodeExtraMap.set(unicode, glyphId);
|
||||
usedGlyphIds.add(glyphId);
|
||||
}
|
||||
@ -785,6 +764,7 @@ function createOS2Table(properties, charstrings, override) {
|
||||
|
||||
let firstCharIndex = null;
|
||||
let lastCharIndex = 0;
|
||||
let position = -1;
|
||||
|
||||
if (charstrings) {
|
||||
for (let code in charstrings) {
|
||||
@ -796,7 +776,7 @@ function createOS2Table(properties, charstrings, override) {
|
||||
lastCharIndex = code;
|
||||
}
|
||||
|
||||
const position = getUnicodeRangeFor(code);
|
||||
position = getUnicodeRangeFor(code, position);
|
||||
if (position < 32) {
|
||||
ulUnicodeRange1 |= 1 << position;
|
||||
} else if (position < 64) {
|
||||
|
1685
src/core/unicode.js
1685
src/core/unicode.js
File diff suppressed because it is too large
Load Diff
@ -745,7 +745,7 @@ class WorkerMessageHandler {
|
||||
});
|
||||
|
||||
handler.on("GetTextContent", function (data, sink) {
|
||||
const { pageIndex, includeMarkedContent } = data;
|
||||
const { pageIndex, includeMarkedContent, disableNormalization } = data;
|
||||
|
||||
pdfManager.getPage(pageIndex).then(function (page) {
|
||||
const task = new WorkerTask("GetTextContent: page " + pageIndex);
|
||||
@ -760,6 +760,7 @@ class WorkerMessageHandler {
|
||||
task,
|
||||
sink,
|
||||
includeMarkedContent,
|
||||
disableNormalization,
|
||||
})
|
||||
.then(
|
||||
function () {
|
||||
|
@ -1122,6 +1122,8 @@ class PDFDocumentProxy {
|
||||
* @typedef {Object} getTextContentParameters
|
||||
* @property {boolean} [includeMarkedContent] - When true include marked
|
||||
* content items in the items array of TextContent. The default is `false`.
|
||||
* @property {boolean} [disableNormalization] - When true the text is *not*
|
||||
* normalized in the worker-thread. The default is `false`.
|
||||
*/
|
||||
|
||||
/**
|
||||
@ -1598,7 +1600,10 @@ class PDFPageProxy {
|
||||
* @param {getTextContentParameters} params - getTextContent parameters.
|
||||
* @returns {ReadableStream} Stream for reading text content chunks.
|
||||
*/
|
||||
streamTextContent({ includeMarkedContent = false } = {}) {
|
||||
streamTextContent({
|
||||
includeMarkedContent = false,
|
||||
disableNormalization = false,
|
||||
} = {}) {
|
||||
const TEXT_CONTENT_CHUNK_SIZE = 100;
|
||||
|
||||
return this._transport.messageHandler.sendWithStream(
|
||||
@ -1606,6 +1611,7 @@ class PDFPageProxy {
|
||||
{
|
||||
pageIndex: this._pageIndex,
|
||||
includeMarkedContent: includeMarkedContent === true,
|
||||
disableNormalization: disableNormalization === true,
|
||||
},
|
||||
{
|
||||
highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
|
||||
|
@ -35,6 +35,7 @@ import {
|
||||
FeatureTest,
|
||||
InvalidPDFException,
|
||||
MissingPDFException,
|
||||
normalizeUnicode,
|
||||
OPS,
|
||||
PasswordResponses,
|
||||
PermissionFlag,
|
||||
@ -100,6 +101,7 @@ export {
|
||||
isPdfFile,
|
||||
loadScript,
|
||||
MissingPDFException,
|
||||
normalizeUnicode,
|
||||
OPS,
|
||||
PasswordResponses,
|
||||
PDFDataRangeTransport,
|
||||
|
@ -1026,6 +1026,25 @@ function createPromiseCapability() {
|
||||
return capability;
|
||||
}
|
||||
|
||||
// Both values are created lazily, on the first `normalizeUnicode` call.
let NormalizeRegex = null;
let NormalizationMap = null;

/**
 * Replace the characters matched by `NormalizeRegex` (mostly ligatures and
 * presentation forms) in `str`.
 *
 * Runs of characters from the first regex group are replaced by their NFKC
 * form. U+FB05 is handled by the second group and is replaced through
 * `NormalizationMap` instead of through NFKC.
 *
 * @param {string} str - The text to normalize.
 * @returns {string} The text with all the matched characters replaced; any
 *   other character is left untouched.
 */
function normalizeUnicode(str) {
  if (!NormalizeRegex) {
    // In order to generate the following regex:
    //  - create a PDF containing all the chars in the range 0000-FFFF whose
    //    NFKC form is different from the char itself;
    //  - copy and paste all those chars and keep the ones where NFKC is
    //    required.
    // It appears that most of the chars here contain some ligatures.
    NormalizeRegex =
      /([\u00a0\u00b5\u037e\u0eb3\u2000-\u200a\u202f\u2126\ufb00-\ufb04\ufb06\ufb20-\ufb36\ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufba1\ufba4-\ufba9\ufbae-\ufbb1\ufbd3-\ufbdc\ufbde-\ufbe7\ufbea-\ufbf8\ufbfc-\ufbfd\ufc00-\ufc5d\ufc64-\ufcf1\ufcf5-\ufd3d\ufd88\ufdf4\ufdfa-\ufdfb\ufe71\ufe77\ufe79\ufe7b\ufe7d]+)|(\ufb05+)/gu;
    // Escapes are used on purpose: the key must be the single code unit
    // U+FB05 and the value is U+017F followed by "t".
    NormalizationMap = new Map([["\ufb05", "\u017ft"]]);
  }
  return str.replaceAll(NormalizeRegex, (_, p1, p2) => {
    if (p1) {
      return p1.normalize("NFKC");
    }
    // `p2` is a run of one *or more* mapped characters, hence each character
    // is looked up individually: looking up the whole run would miss the map
    // for any run longer than one and insert the text "undefined".
    return Array.from(p2, c => NormalizationMap.get(c) ?? c).join("");
  });
}
|
||||
|
||||
export {
|
||||
AbortException,
|
||||
AnnotationActionEventType,
|
||||
@ -1064,6 +1083,7 @@ export {
|
||||
LINE_FACTOR,
|
||||
MAX_IMAGE_SIZE_TO_CACHE,
|
||||
MissingPDFException,
|
||||
normalizeUnicode,
|
||||
objectFromMap,
|
||||
objectSize,
|
||||
OPS,
|
||||
|
@ -693,6 +693,7 @@ class Driver {
|
||||
initPromise = page
|
||||
.getTextContent({
|
||||
includeMarkedContent: true,
|
||||
disableNormalization: true,
|
||||
})
|
||||
.then(function (textContent) {
|
||||
return Rasterize.textLayer(
|
||||
|
@ -28,7 +28,7 @@ describe("Copy and paste", () => {
|
||||
await closePages(pages);
|
||||
});
|
||||
|
||||
it("must check that we've all the contents", async () => {
|
||||
it("must check that we've all the contents on copy/paste", async () => {
|
||||
await Promise.all(
|
||||
pages.map(async ([browserName, page]) => {
|
||||
await page.keyboard.down("Control");
|
||||
@ -117,4 +117,47 @@ describe("Copy and paste", () => {
|
||||
);
|
||||
});
|
||||
});
|
||||
// Integration suite: opens "copy_paste_ligatures.pdf" in every browser page
// returned by `loadAndWait`, selects the whole text, copies it and checks the
// text read back from the clipboard.
describe("all text", () => {
|
||||
let pages;
|
||||
|
||||
beforeAll(async () => {
|
||||
// Wait for the ".textLayer" selector before running the test.
pages = await loadAndWait("copy_paste_ligatures.pdf", ".textLayer");
|
||||
await mockClipboard(pages);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await closePages(pages);
|
||||
});
|
||||
|
||||
it("must check that we've all the contents on copy/paste", async () => {
|
||||
await Promise.all(
|
||||
pages.map(async ([browserName, page]) => {
|
||||
// Ctrl+A: select everything.
await page.keyboard.down("Control");
|
||||
await page.keyboard.press("a");
|
||||
await page.keyboard.up("Control");
|
||||
|
||||
await page.waitForTimeout(100);
|
||||
|
||||
// Ctrl+C: copy the selection.
await page.keyboard.down("Control");
|
||||
await page.keyboard.press("c");
|
||||
await page.keyboard.up("Control");
|
||||
|
||||
await page.waitForTimeout(100);
|
||||
|
||||
// Wait until the viewer container no longer shows the "wait" cursor.
await page.waitForFunction(
|
||||
`document.querySelector('#viewerContainer').style.cursor !== "wait"`
|
||||
);
|
||||
|
||||
const text = await page.evaluate(() =>
|
||||
navigator.clipboard.readText()
|
||||
);
|
||||
|
||||
// The clipboard must be non-empty and contain exactly the string below.
expect(!!text).withContext(`In ${browserName}`).toEqual(true);
|
||||
expect(text)
|
||||
.withContext(`In ${browserName}`)
|
||||
.toEqual("abcdeffffiflffifflſtstghijklmno");
|
||||
})
|
||||
);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -585,3 +585,4 @@
|
||||
!issue16221.pdf
|
||||
!issue16224.pdf
|
||||
!issue16278.pdf
|
||||
!copy_paste_ligatures.pdf
|
||||
|
BIN
test/pdfs/copy_paste_ligatures.pdf
Executable file
BIN
test/pdfs/copy_paste_ligatures.pdf
Executable file
Binary file not shown.
@ -2340,7 +2340,9 @@ page 1 / 3`);
|
||||
);
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items, styles } = await pdfPage.getTextContent();
|
||||
const { items, styles } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
expect(items.length).toEqual(1);
|
||||
// Font name will be a random object id.
|
||||
const fontName = items[0].fontName;
|
||||
@ -2376,7 +2378,9 @@ page 1 / 3`);
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(text).toEqual(
|
||||
@ -2394,7 +2398,9 @@ page 1 / 3`);
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue16119.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
@ -2410,7 +2416,9 @@ page 1 / 3`);
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
@ -2436,7 +2444,9 @@ page 1 / 3`);
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
@ -2456,7 +2466,9 @@ page 1 / 3`);
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
@ -2475,11 +2487,27 @@ page 1 / 3`);
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
let { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
let text = mergeText(items);
|
||||
let expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
||||
Open Sans was designed with an upright stress, open forms and a neu-
|
||||
tral, yet friendly appearance. It was optimized for print, web, and mobile
|
||||
interfaces, and has excellent legibility characteristics in its letterforms (see
|
||||
figure \x81 on the following page). This font is available from the Google Font
|
||||
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
|
||||
This package provides support for this font in LATEX. It includes Type \x81
|
||||
versions of the fonts, converted for this package using FontForge from its
|
||||
sources, for full support with Dvips.`;
|
||||
|
||||
expect(
|
||||
text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
||||
expect(text.includes(expected)).toEqual(true);
|
||||
|
||||
({ items } = await pdfPage.getTextContent({
|
||||
disableNormalization: false,
|
||||
}));
|
||||
text = mergeText(items);
|
||||
expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
||||
Open Sans was designed with an upright stress, open forms and a neu-
|
||||
tral, yet friendly appearance. It was optimized for print, web, and mobile
|
||||
interfaces, and has excellent legibility characteristics in its letterforms (see
|
||||
@ -2487,8 +2515,8 @@ figure \x81 on the following page). This font is available from the Google Font
|
||||
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
|
||||
This package provides support for this font in LATEX. It includes Type \x81
|
||||
versions of the fonts, converted for this package using FontForge from its
|
||||
sources, for full support with Dvips.`)
|
||||
).toEqual(true);
|
||||
sources, for full support with Dvips.`;
|
||||
expect(text.includes(expected)).toEqual(true);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
@ -2501,7 +2529,9 @@ sources, for full support with Dvips.`)
|
||||
const loadingTask = getDocument(buildGetDocumentParams("bug931481.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
@ -2529,7 +2559,9 @@ sozialökonomische Gerechtigkeit.`)
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
@ -2550,7 +2582,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
);
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(text).toEqual(
|
||||
@ -2568,7 +2602,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(6);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(/win aisle/.test(text)).toEqual(false);
|
||||
@ -2586,10 +2622,12 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
const pdfPage = await pdfDoc.getPage(568);
|
||||
let { items } = await pdfPage.getTextContent({
|
||||
includeMarkedContent: false,
|
||||
disableNormalization: true,
|
||||
});
|
||||
const textWithoutMC = mergeText(items);
|
||||
({ items } = await pdfPage.getTextContent({
|
||||
includeMarkedContent: true,
|
||||
disableNormalization: true,
|
||||
}));
|
||||
const textWithMC = mergeText(items);
|
||||
|
||||
@ -2607,7 +2645,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
);
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(text).toEqual("𠮷");
|
||||
@ -2619,7 +2659,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const { items } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
|
||||
expect(items.map(i => i.str)).toEqual(["Hello ", "World"]);
|
||||
|
||||
|
@ -542,7 +542,7 @@ describe("pdf_find_controller", function () {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[2743]],
|
||||
pageMatches: [[2734]],
|
||||
pageMatchesLength: [[14]],
|
||||
});
|
||||
});
|
||||
@ -561,7 +561,7 @@ describe("pdf_find_controller", function () {
|
||||
pageIndex: 1,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[], [1493]],
|
||||
pageMatches: [[], [1486]],
|
||||
pageMatchesLength: [[], [11]],
|
||||
});
|
||||
});
|
||||
@ -594,7 +594,7 @@ describe("pdf_find_controller", function () {
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[2087],
|
||||
[2081],
|
||||
],
|
||||
pageMatchesLength: [
|
||||
[24],
|
||||
@ -629,7 +629,7 @@ describe("pdf_find_controller", function () {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1501]],
|
||||
pageMatches: [[1497]],
|
||||
pageMatchesLength: [[25]],
|
||||
});
|
||||
});
|
||||
@ -670,7 +670,7 @@ describe("pdf_find_controller", function () {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1946]],
|
||||
pageMatches: [[1941]],
|
||||
pageMatchesLength: [[21]],
|
||||
});
|
||||
});
|
||||
@ -692,7 +692,7 @@ describe("pdf_find_controller", function () {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1946]],
|
||||
pageMatches: [[1941]],
|
||||
pageMatchesLength: [[23]],
|
||||
});
|
||||
});
|
||||
@ -712,7 +712,7 @@ describe("pdf_find_controller", function () {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1946]],
|
||||
pageMatches: [[1941]],
|
||||
pageMatchesLength: [[23]],
|
||||
});
|
||||
});
|
||||
@ -976,4 +976,61 @@ describe("pdf_find_controller", function () {
|
||||
pageMatchesLength: [[5, 5]],
|
||||
});
|
||||
});
|
||||
|
||||
// Runs two searches on "ArabicCIDTrueType.pdf", one with the query U+0629 and
// one with the query U+FE94, and expects exactly the same matches for both.
it("performs a search in a text with some arabic chars in different unicode ranges but with same normalized form", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||
"ArabicCIDTrueType.pdf"
|
||||
);
|
||||
|
||||
// First search: query is U+0629.
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "\u0629",
|
||||
},
|
||||
matchesPerPage: [4],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[6, 25, 44, 63]],
|
||||
pageMatchesLength: [[1, 1, 1, 1]],
|
||||
});
|
||||
|
||||
// Second search: query is U+FE94, same expectations as above.
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "\ufe94",
|
||||
},
|
||||
matchesPerPage: [4],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[6, 25, 44, 63]],
|
||||
pageMatchesLength: [[1, 1, 1, 1]],
|
||||
});
|
||||
});
|
||||
|
||||
// Searches for "f" in "copy_paste_ligatures.pdf" and expects nine matches on
// the first page.
it("performs a search in a text with some f ligatures", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||
"copy_paste_ligatures.pdf"
|
||||
);
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "f",
|
||||
},
|
||||
matchesPerPage: [9],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
// NOTE(review): some positions are listed twice (6, 9 and 10) - presumably
// a ligature containing several "f" yields one match per "f" at the same
// position; confirm against the find controller.
pageMatches: [[5, 6, 6, 7, 8, 9, 9, 10, 10]],
|
||||
pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -15,11 +15,9 @@
|
||||
|
||||
import {
|
||||
getCharUnicodeCategory,
|
||||
getNormalizedUnicodes,
|
||||
getUnicodeForGlyph,
|
||||
getUnicodeRangeFor,
|
||||
mapSpecialUnicodeValues,
|
||||
reverseIfRtl,
|
||||
} from "../../src/core/unicode.js";
|
||||
import {
|
||||
getDingbatsGlyphsUnicode,
|
||||
@ -152,69 +150,12 @@ describe("unicode", function () {
|
||||
expect(getUnicodeRangeFor(0x0041)).toEqual(0);
|
||||
// fi (Alphabetic Presentation Forms)
|
||||
expect(getUnicodeRangeFor(0xfb01)).toEqual(62);
|
||||
// Combining diacritic (Cyrillic Extended-A)
|
||||
expect(getUnicodeRangeFor(0x2dff)).toEqual(9);
|
||||
});
|
||||
|
||||
it("should not get a Unicode range", function () {
|
||||
expect(getUnicodeRangeFor(0x05ff)).toEqual(-1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("getNormalizedUnicodes", function () {
|
||||
let NormalizedUnicodes;
|
||||
|
||||
beforeAll(function () {
|
||||
NormalizedUnicodes = getNormalizedUnicodes();
|
||||
});
|
||||
|
||||
afterAll(function () {
|
||||
NormalizedUnicodes = null;
|
||||
});
|
||||
|
||||
it("should get normalized Unicode values for ligatures", function () {
|
||||
// fi => f + i
|
||||
expect(NormalizedUnicodes["\uFB01"]).toEqual("fi");
|
||||
// Arabic
|
||||
expect(NormalizedUnicodes["\u0675"]).toEqual("\u0627\u0674");
|
||||
});
|
||||
|
||||
it("should not normalize standard characters", function () {
|
||||
expect(NormalizedUnicodes.A).toEqual(undefined);
|
||||
});
|
||||
});
|
||||
|
||||
describe("reverseIfRtl", function () {
|
||||
let NormalizedUnicodes;
|
||||
|
||||
function getGlyphUnicode(char) {
|
||||
if (NormalizedUnicodes[char] !== undefined) {
|
||||
return NormalizedUnicodes[char];
|
||||
}
|
||||
return char;
|
||||
}
|
||||
|
||||
beforeAll(function () {
|
||||
NormalizedUnicodes = getNormalizedUnicodes();
|
||||
});
|
||||
|
||||
afterAll(function () {
|
||||
NormalizedUnicodes = null;
|
||||
});
|
||||
|
||||
it("should not reverse LTR characters", function () {
|
||||
const A = getGlyphUnicode("A");
|
||||
expect(reverseIfRtl(A)).toEqual("A");
|
||||
|
||||
const fi = getGlyphUnicode("\uFB01");
|
||||
expect(reverseIfRtl(fi)).toEqual("fi");
|
||||
});
|
||||
|
||||
it("should reverse RTL characters", function () {
|
||||
// Hebrew (no-op, since it's not a combined character)
|
||||
const heAlef = getGlyphUnicode("\u05D0");
|
||||
expect(reverseIfRtl(heAlef)).toEqual("\u05D0");
|
||||
// Arabic
|
||||
const arAlef = getGlyphUnicode("\u0675");
|
||||
expect(reverseIfRtl(arAlef)).toEqual("\u0674\u0627");
|
||||
expect(getUnicodeRangeFor(0xaa60)).toEqual(-1);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -18,8 +18,8 @@
|
||||
/** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
|
||||
|
||||
import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
|
||||
import { getCharacterType, getNormalizeWithNFKC } from "./pdf_find_utils.js";
|
||||
import { createPromiseCapability } from "pdfjs-lib";
|
||||
import { getCharacterType } from "./pdf_find_utils.js";
|
||||
|
||||
const FindState = {
|
||||
FOUND: 0,
|
||||
@ -126,12 +126,7 @@ function normalize(text) {
|
||||
} else {
|
||||
// Compile the regular expression for text normalization once.
|
||||
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
||||
const toNormalizeWithNFKC =
|
||||
"\u2460-\u2473" + // Circled numbers.
|
||||
"\u24b6-\u24ff" + // Circled letters/numbers.
|
||||
"\u3244-\u32bf" + // Circled ideograms/numbers.
|
||||
"\u32d0-\u32fe" + // Circled ideograms.
|
||||
"\uff00-\uffef"; // Halfwidth, fullwidth forms.
|
||||
const toNormalizeWithNFKC = getNormalizeWithNFKC();
|
||||
|
||||
// 3040-309F: Hiragana
|
||||
// 30A0-30FF: Katakana
|
||||
@ -840,6 +835,7 @@ class PDFFindController {
|
||||
}
|
||||
|
||||
let promise = Promise.resolve();
|
||||
const textOptions = { disableNormalization: true };
|
||||
for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
|
||||
const extractTextCapability = createPromiseCapability();
|
||||
this._extractTextPromises[i] = extractTextCapability.promise;
|
||||
@ -848,7 +844,7 @@ class PDFFindController {
|
||||
return this._pdfDocument
|
||||
.getPage(i + 1)
|
||||
.then(pdfPage => {
|
||||
return pdfPage.getTextContent();
|
||||
return pdfPage.getTextContent(textOptions);
|
||||
})
|
||||
.then(
|
||||
textContent => {
|
||||
|
@ -112,4 +112,46 @@ function getCharacterType(charCode) {
|
||||
return CharacterType.ALPHA_LETTER;
|
||||
}
|
||||
|
||||
export { CharacterType, getCharacterType };
|
||||
let NormalizeWithNFKC;
|
||||
function getNormalizeWithNFKC() {
|
||||
/* eslint-disable no-irregular-whitespace */
|
||||
NormalizeWithNFKC ||= ` ¨ª¯²-µ¸-º¼-¾IJ-ijĿ-ŀʼnſDŽ-njDZ-dzʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´-῾ - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉-ℓℕ-№ℙ-ℝ℠-™ℤΩℨK-ℭℯ-ℱℳ-ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕ 〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰꟲ-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ff-stﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ!-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ¢-₩`;
|
||||
|
||||
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
|
||||
const ranges = [];
|
||||
const range = [];
|
||||
const diacriticsRegex = /^\p{M}$/u;
|
||||
// Some chars must be replaced by their NFKC counterpart during a search.
|
||||
for (let i = 0; i < 65536; i++) {
|
||||
const c = String.fromCharCode(i);
|
||||
if (c.normalize("NFKC") !== c && !diacriticsRegex.test(c)) {
|
||||
if (range.length !== 2) {
|
||||
range[0] = range[1] = i;
|
||||
continue;
|
||||
}
|
||||
if (range[1] + 1 !== i) {
|
||||
if (range[0] === range[1]) {
|
||||
ranges.push(String.fromCharCode(range[0]));
|
||||
} else {
|
||||
ranges.push(
|
||||
`${String.fromCharCode(range[0])}-${String.fromCharCode(
|
||||
range[1]
|
||||
)}`
|
||||
);
|
||||
}
|
||||
range[0] = range[1] = i;
|
||||
} else {
|
||||
range[1] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ranges.join("") !== NormalizeWithNFKC) {
|
||||
throw new Error(
|
||||
"getNormalizeWithNFKC - update the `NormalizeWithNFKC` string."
|
||||
);
|
||||
}
|
||||
}
|
||||
return NormalizeWithNFKC;
|
||||
}
|
||||
|
||||
export { CharacterType, getCharacterType, getNormalizeWithNFKC };
|
||||
|
@ -368,6 +368,7 @@ class PDFPageView {
|
||||
if (!textLayer.renderingDone) {
|
||||
const readableStream = pdfPage.streamTextContent({
|
||||
includeMarkedContent: true,
|
||||
disableNormalization: true,
|
||||
});
|
||||
textLayer.setTextContentSource(readableStream);
|
||||
}
|
||||
|
@ -665,6 +665,8 @@ class PDFViewer {
|
||||
}
|
||||
buffer.length = 0;
|
||||
const page = await this.pdfDocument.getPage(pageNum);
|
||||
// By default `getTextContent` is called with `disableNormalization` set to
|
||||
// `false`, which is fine here because we want a normalized string.
|
||||
const { items } = await page.getTextContent();
|
||||
for (const item of items) {
|
||||
if (item.str) {
|
||||
|
@ -208,9 +208,20 @@ class TextHighlighter {
|
||||
return;
|
||||
}
|
||||
|
||||
let lastDivIdx = -1;
|
||||
let lastOffset = -1;
|
||||
for (let i = i0; i < i1; i++) {
|
||||
const match = matches[i];
|
||||
const begin = match.begin;
|
||||
if (begin.divIdx === lastDivIdx && begin.offset === lastOffset) {
|
||||
// Two consecutive matches can begin at the same position, e.g. when we
|
||||
// searched for a 'f' and the text contains a 'ff' ligature: the ligature
|
||||
// must be highlighted only once, hence the duplicated match is skipped.
|
||||
continue;
|
||||
}
|
||||
lastDivIdx = begin.divIdx;
|
||||
lastOffset = begin.offset;
|
||||
|
||||
const end = match.end;
|
||||
const isSelected = isSelectedPage && i === selectedMatchIdx;
|
||||
const highlightSuffix = isSelected ? " selected" : "";
|
||||
|
@ -20,7 +20,8 @@
|
||||
// eslint-disable-next-line max-len
|
||||
/** @typedef {import("./text_accessibility.js").TextAccessibilityManager} TextAccessibilityManager */
|
||||
|
||||
import { renderTextLayer, updateTextLayer } from "pdfjs-lib";
|
||||
import { normalizeUnicode, renderTextLayer, updateTextLayer } from "pdfjs-lib";
|
||||
import { removeNullCharacters } from "./ui_utils.js";
|
||||
|
||||
/**
|
||||
* @typedef {Object} TextLayerBuilderOptions
|
||||
@ -212,6 +213,16 @@ class TextLayerBuilder {
|
||||
}
|
||||
end.classList.remove("active");
|
||||
});
|
||||
|
||||
div.addEventListener("copy", event => {
|
||||
const selection = document.getSelection();
|
||||
event.clipboardData.setData(
|
||||
"text/plain",
|
||||
removeNullCharacters(normalizeUnicode(selection.toString()))
|
||||
);
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user