Merge pull request #16200 from calixteman/dont_normalize
[api-minor] Don't normalize the text used in the text layer.
This commit is contained in:
commit
dbe0c4e60c
@ -147,7 +147,11 @@ function bidi(str, startLevel = -1, vertical = false) {
|
|||||||
if (!charType) {
|
if (!charType) {
|
||||||
warn("Bidi: invalid Unicode character " + charCode.toString(16));
|
warn("Bidi: invalid Unicode character " + charCode.toString(16));
|
||||||
}
|
}
|
||||||
} else if (0x0700 <= charCode && charCode <= 0x08ac) {
|
} else if (
|
||||||
|
(0x0700 <= charCode && charCode <= 0x08ac) ||
|
||||||
|
(0xfb50 <= charCode && charCode <= 0xfdff) ||
|
||||||
|
(0xfe70 <= charCode && charCode <= 0xfeff)
|
||||||
|
) {
|
||||||
charType = "AL";
|
charType = "AL";
|
||||||
}
|
}
|
||||||
if (charType === "R" || charType === "AL" || charType === "AN") {
|
if (charType === "R" || charType === "AL" || charType === "AN") {
|
||||||
|
@ -511,7 +511,13 @@ class Page {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
extractTextContent({ handler, task, includeMarkedContent, sink }) {
|
extractTextContent({
|
||||||
|
handler,
|
||||||
|
task,
|
||||||
|
includeMarkedContent,
|
||||||
|
disableNormalization,
|
||||||
|
sink,
|
||||||
|
}) {
|
||||||
const contentStreamPromise = this.getContentStream();
|
const contentStreamPromise = this.getContentStream();
|
||||||
const resourcesPromise = this.loadResources([
|
const resourcesPromise = this.loadResources([
|
||||||
"ExtGState",
|
"ExtGState",
|
||||||
@ -539,6 +545,7 @@ class Page {
|
|||||||
task,
|
task,
|
||||||
resources: this.resources,
|
resources: this.resources,
|
||||||
includeMarkedContent,
|
includeMarkedContent,
|
||||||
|
disableNormalization,
|
||||||
sink,
|
sink,
|
||||||
viewBox: this.view,
|
viewBox: this.view,
|
||||||
});
|
});
|
||||||
|
@ -24,6 +24,7 @@ import {
|
|||||||
IDENTITY_MATRIX,
|
IDENTITY_MATRIX,
|
||||||
info,
|
info,
|
||||||
isArrayEqual,
|
isArrayEqual,
|
||||||
|
normalizeUnicode,
|
||||||
OPS,
|
OPS,
|
||||||
shadow,
|
shadow,
|
||||||
stringToPDFString,
|
stringToPDFString,
|
||||||
@ -2271,6 +2272,7 @@ class PartialEvaluator {
|
|||||||
seenStyles = new Set(),
|
seenStyles = new Set(),
|
||||||
viewBox,
|
viewBox,
|
||||||
markedContentData = null,
|
markedContentData = null,
|
||||||
|
disableNormalization = false,
|
||||||
}) {
|
}) {
|
||||||
// Ensure that `resources`/`stateManager` is correctly initialized,
|
// Ensure that `resources`/`stateManager` is correctly initialized,
|
||||||
// even if the provided parameter is e.g. `null`.
|
// even if the provided parameter is e.g. `null`.
|
||||||
@ -2524,7 +2526,10 @@ class PartialEvaluator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function runBidiTransform(textChunk) {
|
function runBidiTransform(textChunk) {
|
||||||
const text = textChunk.str.join("");
|
let text = textChunk.str.join("");
|
||||||
|
if (!disableNormalization) {
|
||||||
|
text = normalizeUnicode(text);
|
||||||
|
}
|
||||||
const bidiResult = bidi(text, -1, textChunk.vertical);
|
const bidiResult = bidi(text, -1, textChunk.vertical);
|
||||||
return {
|
return {
|
||||||
str: bidiResult.str,
|
str: bidiResult.str,
|
||||||
@ -2859,7 +2864,7 @@ class PartialEvaluator {
|
|||||||
textChunk.prevTransform = getCurrentTextTransform();
|
textChunk.prevTransform = getCurrentTextTransform();
|
||||||
}
|
}
|
||||||
|
|
||||||
const glyphUnicode = glyph.normalizedUnicode;
|
const glyphUnicode = glyph.unicode;
|
||||||
if (saveLastChar(glyphUnicode)) {
|
if (saveLastChar(glyphUnicode)) {
|
||||||
// The two last chars are a non-whitespace followed by a whitespace
|
// The two last chars are a non-whitespace followed by a whitespace
|
||||||
// and then this non-whitespace, so we insert a whitespace here.
|
// and then this non-whitespace, so we insert a whitespace here.
|
||||||
@ -3242,6 +3247,7 @@ class PartialEvaluator {
|
|||||||
seenStyles,
|
seenStyles,
|
||||||
viewBox,
|
viewBox,
|
||||||
markedContentData,
|
markedContentData,
|
||||||
|
disableNormalization,
|
||||||
})
|
})
|
||||||
.then(function () {
|
.then(function () {
|
||||||
if (!sinkWrapper.enqueueInvoked) {
|
if (!sinkWrapper.enqueueInvoked) {
|
||||||
|
@ -33,11 +33,9 @@ import {
|
|||||||
} from "./fonts_utils.js";
|
} from "./fonts_utils.js";
|
||||||
import {
|
import {
|
||||||
getCharUnicodeCategory,
|
getCharUnicodeCategory,
|
||||||
getNormalizedUnicodes,
|
|
||||||
getUnicodeForGlyph,
|
getUnicodeForGlyph,
|
||||||
getUnicodeRangeFor,
|
getUnicodeRangeFor,
|
||||||
mapSpecialUnicodeValues,
|
mapSpecialUnicodeValues,
|
||||||
reverseIfRtl,
|
|
||||||
} from "./unicode.js";
|
} from "./unicode.js";
|
||||||
import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
|
import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
|
||||||
import {
|
import {
|
||||||
@ -277,24 +275,6 @@ class Glyph {
|
|||||||
/* nonSerializable = */ true
|
/* nonSerializable = */ true
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* This property, which is only used by `PartialEvaluator.getTextContent`,
|
|
||||||
* is purposely made non-serializable.
|
|
||||||
* @type {string}
|
|
||||||
*/
|
|
||||||
get normalizedUnicode() {
|
|
||||||
return shadow(
|
|
||||||
this,
|
|
||||||
"normalizedUnicode",
|
|
||||||
reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
|
|
||||||
/* nonSerializable = */ true
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
static get _NormalizedUnicodes() {
|
|
||||||
return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function int16(b0, b1) {
|
function int16(b0, b1) {
|
||||||
@ -507,6 +487,9 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
|
|||||||
const privateUseOffetStart = PRIVATE_USE_AREAS[privateUseAreaIndex][0];
|
const privateUseOffetStart = PRIVATE_USE_AREAS[privateUseAreaIndex][0];
|
||||||
let nextAvailableFontCharCode = privateUseOffetStart;
|
let nextAvailableFontCharCode = privateUseOffetStart;
|
||||||
let privateUseOffetEnd = PRIVATE_USE_AREAS[privateUseAreaIndex][1];
|
let privateUseOffetEnd = PRIVATE_USE_AREAS[privateUseAreaIndex][1];
|
||||||
|
const isInPrivateArea = code =>
|
||||||
|
(PRIVATE_USE_AREAS[0][0] <= code && code <= PRIVATE_USE_AREAS[0][1]) ||
|
||||||
|
(PRIVATE_USE_AREAS[1][0] <= code && code <= PRIVATE_USE_AREAS[1][1]);
|
||||||
for (let originalCharCode in charCodeToGlyphId) {
|
for (let originalCharCode in charCodeToGlyphId) {
|
||||||
originalCharCode |= 0;
|
originalCharCode |= 0;
|
||||||
let glyphId = charCodeToGlyphId[originalCharCode];
|
let glyphId = charCodeToGlyphId[originalCharCode];
|
||||||
@ -539,11 +522,7 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
|
|||||||
if (typeof unicode === "string") {
|
if (typeof unicode === "string") {
|
||||||
unicode = unicode.codePointAt(0);
|
unicode = unicode.codePointAt(0);
|
||||||
}
|
}
|
||||||
if (
|
if (unicode && !isInPrivateArea(unicode) && !usedGlyphIds.has(glyphId)) {
|
||||||
unicode &&
|
|
||||||
unicode < privateUseOffetStart &&
|
|
||||||
!usedGlyphIds.has(glyphId)
|
|
||||||
) {
|
|
||||||
toUnicodeExtraMap.set(unicode, glyphId);
|
toUnicodeExtraMap.set(unicode, glyphId);
|
||||||
usedGlyphIds.add(glyphId);
|
usedGlyphIds.add(glyphId);
|
||||||
}
|
}
|
||||||
@ -785,6 +764,7 @@ function createOS2Table(properties, charstrings, override) {
|
|||||||
|
|
||||||
let firstCharIndex = null;
|
let firstCharIndex = null;
|
||||||
let lastCharIndex = 0;
|
let lastCharIndex = 0;
|
||||||
|
let position = -1;
|
||||||
|
|
||||||
if (charstrings) {
|
if (charstrings) {
|
||||||
for (let code in charstrings) {
|
for (let code in charstrings) {
|
||||||
@ -796,7 +776,7 @@ function createOS2Table(properties, charstrings, override) {
|
|||||||
lastCharIndex = code;
|
lastCharIndex = code;
|
||||||
}
|
}
|
||||||
|
|
||||||
const position = getUnicodeRangeFor(code);
|
position = getUnicodeRangeFor(code, position);
|
||||||
if (position < 32) {
|
if (position < 32) {
|
||||||
ulUnicodeRange1 |= 1 << position;
|
ulUnicodeRange1 |= 1 << position;
|
||||||
} else if (position < 64) {
|
} else if (position < 64) {
|
||||||
|
1685
src/core/unicode.js
1685
src/core/unicode.js
File diff suppressed because it is too large
Load Diff
@ -745,7 +745,7 @@ class WorkerMessageHandler {
|
|||||||
});
|
});
|
||||||
|
|
||||||
handler.on("GetTextContent", function (data, sink) {
|
handler.on("GetTextContent", function (data, sink) {
|
||||||
const { pageIndex, includeMarkedContent } = data;
|
const { pageIndex, includeMarkedContent, disableNormalization } = data;
|
||||||
|
|
||||||
pdfManager.getPage(pageIndex).then(function (page) {
|
pdfManager.getPage(pageIndex).then(function (page) {
|
||||||
const task = new WorkerTask("GetTextContent: page " + pageIndex);
|
const task = new WorkerTask("GetTextContent: page " + pageIndex);
|
||||||
@ -760,6 +760,7 @@ class WorkerMessageHandler {
|
|||||||
task,
|
task,
|
||||||
sink,
|
sink,
|
||||||
includeMarkedContent,
|
includeMarkedContent,
|
||||||
|
disableNormalization,
|
||||||
})
|
})
|
||||||
.then(
|
.then(
|
||||||
function () {
|
function () {
|
||||||
|
@ -1122,6 +1122,8 @@ class PDFDocumentProxy {
|
|||||||
* @typedef {Object} getTextContentParameters
|
* @typedef {Object} getTextContentParameters
|
||||||
* @property {boolean} [includeMarkedContent] - When true include marked
|
* @property {boolean} [includeMarkedContent] - When true include marked
|
||||||
* content items in the items array of TextContent. The default is `false`.
|
* content items in the items array of TextContent. The default is `false`.
|
||||||
|
* @property {boolean} [disableNormalization] - When true the text is *not*
|
||||||
|
* normalized in the worker-thread. The default is `false`.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1598,7 +1600,10 @@ class PDFPageProxy {
|
|||||||
* @param {getTextContentParameters} params - getTextContent parameters.
|
* @param {getTextContentParameters} params - getTextContent parameters.
|
||||||
* @returns {ReadableStream} Stream for reading text content chunks.
|
* @returns {ReadableStream} Stream for reading text content chunks.
|
||||||
*/
|
*/
|
||||||
streamTextContent({ includeMarkedContent = false } = {}) {
|
streamTextContent({
|
||||||
|
includeMarkedContent = false,
|
||||||
|
disableNormalization = false,
|
||||||
|
} = {}) {
|
||||||
const TEXT_CONTENT_CHUNK_SIZE = 100;
|
const TEXT_CONTENT_CHUNK_SIZE = 100;
|
||||||
|
|
||||||
return this._transport.messageHandler.sendWithStream(
|
return this._transport.messageHandler.sendWithStream(
|
||||||
@ -1606,6 +1611,7 @@ class PDFPageProxy {
|
|||||||
{
|
{
|
||||||
pageIndex: this._pageIndex,
|
pageIndex: this._pageIndex,
|
||||||
includeMarkedContent: includeMarkedContent === true,
|
includeMarkedContent: includeMarkedContent === true,
|
||||||
|
disableNormalization: disableNormalization === true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
|
highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
|
||||||
|
@ -35,6 +35,7 @@ import {
|
|||||||
FeatureTest,
|
FeatureTest,
|
||||||
InvalidPDFException,
|
InvalidPDFException,
|
||||||
MissingPDFException,
|
MissingPDFException,
|
||||||
|
normalizeUnicode,
|
||||||
OPS,
|
OPS,
|
||||||
PasswordResponses,
|
PasswordResponses,
|
||||||
PermissionFlag,
|
PermissionFlag,
|
||||||
@ -100,6 +101,7 @@ export {
|
|||||||
isPdfFile,
|
isPdfFile,
|
||||||
loadScript,
|
loadScript,
|
||||||
MissingPDFException,
|
MissingPDFException,
|
||||||
|
normalizeUnicode,
|
||||||
OPS,
|
OPS,
|
||||||
PasswordResponses,
|
PasswordResponses,
|
||||||
PDFDataRangeTransport,
|
PDFDataRangeTransport,
|
||||||
|
@ -1026,6 +1026,25 @@ function createPromiseCapability() {
|
|||||||
return capability;
|
return capability;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let NormalizeRegex = null;
|
||||||
|
let NormalizationMap = null;
|
||||||
|
function normalizeUnicode(str) {
|
||||||
|
if (!NormalizeRegex) {
|
||||||
|
// In order to generate the following regex:
|
||||||
|
// - create a PDF containing all the chars in the range 0000-FFFF with
|
||||||
|
// an NFKC form that differs from the char itself.
|
||||||
|
// - copy and paste all those chars and get the ones where NFKC is
|
||||||
|
// required.
|
||||||
|
// It appears that most of the chars here contain some ligatures.
|
||||||
|
NormalizeRegex =
|
||||||
|
/([\u00a0\u00b5\u037e\u0eb3\u2000-\u200a\u202f\u2126\ufb00-\ufb04\ufb06\ufb20-\ufb36\ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufba1\ufba4-\ufba9\ufbae-\ufbb1\ufbd3-\ufbdc\ufbde-\ufbe7\ufbea-\ufbf8\ufbfc-\ufbfd\ufc00-\ufc5d\ufc64-\ufcf1\ufcf5-\ufd3d\ufd88\ufdf4\ufdfa-\ufdfb\ufe71\ufe77\ufe79\ufe7b\ufe7d]+)|(\ufb05+)/gu;
|
||||||
|
NormalizationMap = new Map([["\ufb05", "ſt"]]);
|
||||||
|
}
|
||||||
|
return str.replaceAll(NormalizeRegex, (_, p1, p2) => {
|
||||||
|
return p1 ? p1.normalize("NFKC") : NormalizationMap.get(p2);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
export {
|
export {
|
||||||
AbortException,
|
AbortException,
|
||||||
AnnotationActionEventType,
|
AnnotationActionEventType,
|
||||||
@ -1064,6 +1083,7 @@ export {
|
|||||||
LINE_FACTOR,
|
LINE_FACTOR,
|
||||||
MAX_IMAGE_SIZE_TO_CACHE,
|
MAX_IMAGE_SIZE_TO_CACHE,
|
||||||
MissingPDFException,
|
MissingPDFException,
|
||||||
|
normalizeUnicode,
|
||||||
objectFromMap,
|
objectFromMap,
|
||||||
objectSize,
|
objectSize,
|
||||||
OPS,
|
OPS,
|
||||||
|
@ -693,6 +693,7 @@ class Driver {
|
|||||||
initPromise = page
|
initPromise = page
|
||||||
.getTextContent({
|
.getTextContent({
|
||||||
includeMarkedContent: true,
|
includeMarkedContent: true,
|
||||||
|
disableNormalization: true,
|
||||||
})
|
})
|
||||||
.then(function (textContent) {
|
.then(function (textContent) {
|
||||||
return Rasterize.textLayer(
|
return Rasterize.textLayer(
|
||||||
|
@ -28,7 +28,7 @@ describe("Copy and paste", () => {
|
|||||||
await closePages(pages);
|
await closePages(pages);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("must check that we've all the contents", async () => {
|
it("must check that we've all the contents on copy/paste", async () => {
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
pages.map(async ([browserName, page]) => {
|
pages.map(async ([browserName, page]) => {
|
||||||
await page.keyboard.down("Control");
|
await page.keyboard.down("Control");
|
||||||
@ -117,4 +117,47 @@ describe("Copy and paste", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
describe("all text", () => {
|
||||||
|
let pages;
|
||||||
|
|
||||||
|
beforeAll(async () => {
|
||||||
|
pages = await loadAndWait("copy_paste_ligatures.pdf", ".textLayer");
|
||||||
|
await mockClipboard(pages);
|
||||||
|
});
|
||||||
|
|
||||||
|
afterAll(async () => {
|
||||||
|
await closePages(pages);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("must check that we've all the contents on copy/paste", async () => {
|
||||||
|
await Promise.all(
|
||||||
|
pages.map(async ([browserName, page]) => {
|
||||||
|
await page.keyboard.down("Control");
|
||||||
|
await page.keyboard.press("a");
|
||||||
|
await page.keyboard.up("Control");
|
||||||
|
|
||||||
|
await page.waitForTimeout(100);
|
||||||
|
|
||||||
|
await page.keyboard.down("Control");
|
||||||
|
await page.keyboard.press("c");
|
||||||
|
await page.keyboard.up("Control");
|
||||||
|
|
||||||
|
await page.waitForTimeout(100);
|
||||||
|
|
||||||
|
await page.waitForFunction(
|
||||||
|
`document.querySelector('#viewerContainer').style.cursor !== "wait"`
|
||||||
|
);
|
||||||
|
|
||||||
|
const text = await page.evaluate(() =>
|
||||||
|
navigator.clipboard.readText()
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(!!text).withContext(`In ${browserName}`).toEqual(true);
|
||||||
|
expect(text)
|
||||||
|
.withContext(`In ${browserName}`)
|
||||||
|
.toEqual("abcdeffffiflffifflſtstghijklmno");
|
||||||
|
})
|
||||||
|
);
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -585,3 +585,4 @@
|
|||||||
!issue16221.pdf
|
!issue16221.pdf
|
||||||
!issue16224.pdf
|
!issue16224.pdf
|
||||||
!issue16278.pdf
|
!issue16278.pdf
|
||||||
|
!copy_paste_ligatures.pdf
|
||||||
|
BIN
test/pdfs/copy_paste_ligatures.pdf
Executable file
BIN
test/pdfs/copy_paste_ligatures.pdf
Executable file
Binary file not shown.
@ -2340,7 +2340,9 @@ page 1 / 3`);
|
|||||||
);
|
);
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items, styles } = await pdfPage.getTextContent();
|
const { items, styles } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
expect(items.length).toEqual(1);
|
expect(items.length).toEqual(1);
|
||||||
// Font name will be a random object id.
|
// Font name will be a random object id.
|
||||||
const fontName = items[0].fontName;
|
const fontName = items[0].fontName;
|
||||||
@ -2376,7 +2378,9 @@ page 1 / 3`);
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(text).toEqual(
|
expect(text).toEqual(
|
||||||
@ -2394,7 +2398,9 @@ page 1 / 3`);
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue16119.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue16119.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
@ -2410,7 +2416,9 @@ page 1 / 3`);
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
@ -2436,7 +2444,9 @@ page 1 / 3`);
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
@ -2456,7 +2466,9 @@ page 1 / 3`);
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
@ -2475,11 +2487,27 @@ page 1 / 3`);
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
let { items } = await pdfPage.getTextContent({
|
||||||
const text = mergeText(items);
|
disableNormalization: true,
|
||||||
|
});
|
||||||
|
let text = mergeText(items);
|
||||||
|
let expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
||||||
|
Open Sans was designed with an upright stress, open forms and a neu-
|
||||||
|
tral, yet friendly appearance. It was optimized for print, web, and mobile
|
||||||
|
interfaces, and has excellent legibility characteristics in its letterforms (see
|
||||||
|
figure \x81 on the following page). This font is available from the Google Font
|
||||||
|
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
|
||||||
|
This package provides support for this font in LATEX. It includes Type \x81
|
||||||
|
versions of the fonts, converted for this package using FontForge from its
|
||||||
|
sources, for full support with Dvips.`;
|
||||||
|
|
||||||
expect(
|
expect(text.includes(expected)).toEqual(true);
|
||||||
text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
|
||||||
|
({ items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: false,
|
||||||
|
}));
|
||||||
|
text = mergeText(items);
|
||||||
|
expected = `Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
||||||
Open Sans was designed with an upright stress, open forms and a neu-
|
Open Sans was designed with an upright stress, open forms and a neu-
|
||||||
tral, yet friendly appearance. It was optimized for print, web, and mobile
|
tral, yet friendly appearance. It was optimized for print, web, and mobile
|
||||||
interfaces, and has excellent legibility characteristics in its letterforms (see
|
interfaces, and has excellent legibility characteristics in its letterforms (see
|
||||||
@ -2487,8 +2515,8 @@ figure \x81 on the following page). This font is available from the Google Font
|
|||||||
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
|
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
|
||||||
This package provides support for this font in LATEX. It includes Type \x81
|
This package provides support for this font in LATEX. It includes Type \x81
|
||||||
versions of the fonts, converted for this package using FontForge from its
|
versions of the fonts, converted for this package using FontForge from its
|
||||||
sources, for full support with Dvips.`)
|
sources, for full support with Dvips.`;
|
||||||
).toEqual(true);
|
expect(text.includes(expected)).toEqual(true);
|
||||||
|
|
||||||
await loadingTask.destroy();
|
await loadingTask.destroy();
|
||||||
});
|
});
|
||||||
@ -2501,7 +2529,9 @@ sources, for full support with Dvips.`)
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("bug931481.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("bug931481.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
@ -2529,7 +2559,9 @@ sozialökonomische Gerechtigkeit.`)
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue9186.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(
|
expect(
|
||||||
@ -2550,7 +2582,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
|||||||
);
|
);
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(text).toEqual(
|
expect(text).toEqual(
|
||||||
@ -2568,7 +2602,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(6);
|
const pdfPage = await pdfDoc.getPage(6);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(/win aisle/.test(text)).toEqual(false);
|
expect(/win aisle/.test(text)).toEqual(false);
|
||||||
@ -2586,10 +2622,12 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
|||||||
const pdfPage = await pdfDoc.getPage(568);
|
const pdfPage = await pdfDoc.getPage(568);
|
||||||
let { items } = await pdfPage.getTextContent({
|
let { items } = await pdfPage.getTextContent({
|
||||||
includeMarkedContent: false,
|
includeMarkedContent: false,
|
||||||
|
disableNormalization: true,
|
||||||
});
|
});
|
||||||
const textWithoutMC = mergeText(items);
|
const textWithoutMC = mergeText(items);
|
||||||
({ items } = await pdfPage.getTextContent({
|
({ items } = await pdfPage.getTextContent({
|
||||||
includeMarkedContent: true,
|
includeMarkedContent: true,
|
||||||
|
disableNormalization: true,
|
||||||
}));
|
}));
|
||||||
const textWithMC = mergeText(items);
|
const textWithMC = mergeText(items);
|
||||||
|
|
||||||
@ -2607,7 +2645,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
|||||||
);
|
);
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
|
|
||||||
expect(text).toEqual("𠮷");
|
expect(text).toEqual("𠮷");
|
||||||
@ -2619,7 +2659,9 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
|
|||||||
const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf"));
|
const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf"));
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items } = await pdfPage.getTextContent();
|
const { items } = await pdfPage.getTextContent({
|
||||||
|
disableNormalization: true,
|
||||||
|
});
|
||||||
|
|
||||||
expect(items.map(i => i.str)).toEqual(["Hello ", "World"]);
|
expect(items.map(i => i.str)).toEqual(["Hello ", "World"]);
|
||||||
|
|
||||||
|
@ -542,7 +542,7 @@ describe("pdf_find_controller", function () {
|
|||||||
pageIndex: 0,
|
pageIndex: 0,
|
||||||
matchIndex: 0,
|
matchIndex: 0,
|
||||||
},
|
},
|
||||||
pageMatches: [[2743]],
|
pageMatches: [[2734]],
|
||||||
pageMatchesLength: [[14]],
|
pageMatchesLength: [[14]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -561,7 +561,7 @@ describe("pdf_find_controller", function () {
|
|||||||
pageIndex: 1,
|
pageIndex: 1,
|
||||||
matchIndex: 0,
|
matchIndex: 0,
|
||||||
},
|
},
|
||||||
pageMatches: [[], [1493]],
|
pageMatches: [[], [1486]],
|
||||||
pageMatchesLength: [[], [11]],
|
pageMatchesLength: [[], [11]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -594,7 +594,7 @@ describe("pdf_find_controller", function () {
|
|||||||
[],
|
[],
|
||||||
[],
|
[],
|
||||||
[],
|
[],
|
||||||
[2087],
|
[2081],
|
||||||
],
|
],
|
||||||
pageMatchesLength: [
|
pageMatchesLength: [
|
||||||
[24],
|
[24],
|
||||||
@ -629,7 +629,7 @@ describe("pdf_find_controller", function () {
|
|||||||
pageIndex: 0,
|
pageIndex: 0,
|
||||||
matchIndex: 0,
|
matchIndex: 0,
|
||||||
},
|
},
|
||||||
pageMatches: [[1501]],
|
pageMatches: [[1497]],
|
||||||
pageMatchesLength: [[25]],
|
pageMatchesLength: [[25]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -670,7 +670,7 @@ describe("pdf_find_controller", function () {
|
|||||||
pageIndex: 0,
|
pageIndex: 0,
|
||||||
matchIndex: 0,
|
matchIndex: 0,
|
||||||
},
|
},
|
||||||
pageMatches: [[1946]],
|
pageMatches: [[1941]],
|
||||||
pageMatchesLength: [[21]],
|
pageMatchesLength: [[21]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -692,7 +692,7 @@ describe("pdf_find_controller", function () {
|
|||||||
pageIndex: 0,
|
pageIndex: 0,
|
||||||
matchIndex: 0,
|
matchIndex: 0,
|
||||||
},
|
},
|
||||||
pageMatches: [[1946]],
|
pageMatches: [[1941]],
|
||||||
pageMatchesLength: [[23]],
|
pageMatchesLength: [[23]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -712,7 +712,7 @@ describe("pdf_find_controller", function () {
|
|||||||
pageIndex: 0,
|
pageIndex: 0,
|
||||||
matchIndex: 0,
|
matchIndex: 0,
|
||||||
},
|
},
|
||||||
pageMatches: [[1946]],
|
pageMatches: [[1941]],
|
||||||
pageMatchesLength: [[23]],
|
pageMatchesLength: [[23]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -976,4 +976,61 @@ describe("pdf_find_controller", function () {
|
|||||||
pageMatchesLength: [[5, 5]],
|
pageMatchesLength: [[5, 5]],
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("performs a search in a text with some arabic chars in different unicode ranges but with same normalized form", async function () {
|
||||||
|
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||||
|
"ArabicCIDTrueType.pdf"
|
||||||
|
);
|
||||||
|
|
||||||
|
await testSearch({
|
||||||
|
eventBus,
|
||||||
|
pdfFindController,
|
||||||
|
state: {
|
||||||
|
query: "\u0629",
|
||||||
|
},
|
||||||
|
matchesPerPage: [4],
|
||||||
|
selectedMatch: {
|
||||||
|
pageIndex: 0,
|
||||||
|
matchIndex: 0,
|
||||||
|
},
|
||||||
|
pageMatches: [[6, 25, 44, 63]],
|
||||||
|
pageMatchesLength: [[1, 1, 1, 1]],
|
||||||
|
});
|
||||||
|
|
||||||
|
await testSearch({
|
||||||
|
eventBus,
|
||||||
|
pdfFindController,
|
||||||
|
state: {
|
||||||
|
query: "\ufe94",
|
||||||
|
},
|
||||||
|
matchesPerPage: [4],
|
||||||
|
selectedMatch: {
|
||||||
|
pageIndex: 0,
|
||||||
|
matchIndex: 0,
|
||||||
|
},
|
||||||
|
pageMatches: [[6, 25, 44, 63]],
|
||||||
|
pageMatchesLength: [[1, 1, 1, 1]],
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it("performs a search in a text with some f ligatures", async function () {
|
||||||
|
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||||
|
"copy_paste_ligatures.pdf"
|
||||||
|
);
|
||||||
|
|
||||||
|
await testSearch({
|
||||||
|
eventBus,
|
||||||
|
pdfFindController,
|
||||||
|
state: {
|
||||||
|
query: "f",
|
||||||
|
},
|
||||||
|
matchesPerPage: [9],
|
||||||
|
selectedMatch: {
|
||||||
|
pageIndex: 0,
|
||||||
|
matchIndex: 0,
|
||||||
|
},
|
||||||
|
pageMatches: [[5, 6, 6, 7, 8, 9, 9, 10, 10]],
|
||||||
|
pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1]],
|
||||||
|
});
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
@ -15,11 +15,9 @@
|
|||||||
|
|
||||||
import {
|
import {
|
||||||
getCharUnicodeCategory,
|
getCharUnicodeCategory,
|
||||||
getNormalizedUnicodes,
|
|
||||||
getUnicodeForGlyph,
|
getUnicodeForGlyph,
|
||||||
getUnicodeRangeFor,
|
getUnicodeRangeFor,
|
||||||
mapSpecialUnicodeValues,
|
mapSpecialUnicodeValues,
|
||||||
reverseIfRtl,
|
|
||||||
} from "../../src/core/unicode.js";
|
} from "../../src/core/unicode.js";
|
||||||
import {
|
import {
|
||||||
getDingbatsGlyphsUnicode,
|
getDingbatsGlyphsUnicode,
|
||||||
@ -152,69 +150,12 @@ describe("unicode", function () {
|
|||||||
expect(getUnicodeRangeFor(0x0041)).toEqual(0);
|
expect(getUnicodeRangeFor(0x0041)).toEqual(0);
|
||||||
// fi (Alphabetic Presentation Forms)
|
// fi (Alphabetic Presentation Forms)
|
||||||
expect(getUnicodeRangeFor(0xfb01)).toEqual(62);
|
expect(getUnicodeRangeFor(0xfb01)).toEqual(62);
|
||||||
|
// Combining diacritic (Cyrillic Extended-A)
|
||||||
|
expect(getUnicodeRangeFor(0x2dff)).toEqual(9);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should not get a Unicode range", function () {
|
it("should not get a Unicode range", function () {
|
||||||
expect(getUnicodeRangeFor(0x05ff)).toEqual(-1);
|
expect(getUnicodeRangeFor(0xaa60)).toEqual(-1);
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
describe("getNormalizedUnicodes", function () {
|
|
||||||
let NormalizedUnicodes;
|
|
||||||
|
|
||||||
beforeAll(function () {
|
|
||||||
NormalizedUnicodes = getNormalizedUnicodes();
|
|
||||||
});
|
|
||||||
|
|
||||||
afterAll(function () {
|
|
||||||
NormalizedUnicodes = null;
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should get normalized Unicode values for ligatures", function () {
|
|
||||||
// fi => f + i
|
|
||||||
expect(NormalizedUnicodes["\uFB01"]).toEqual("fi");
|
|
||||||
// Arabic
|
|
||||||
expect(NormalizedUnicodes["\u0675"]).toEqual("\u0627\u0674");
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should not normalize standard characters", function () {
|
|
||||||
expect(NormalizedUnicodes.A).toEqual(undefined);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
describe("reverseIfRtl", function () {
|
|
||||||
let NormalizedUnicodes;
|
|
||||||
|
|
||||||
function getGlyphUnicode(char) {
|
|
||||||
if (NormalizedUnicodes[char] !== undefined) {
|
|
||||||
return NormalizedUnicodes[char];
|
|
||||||
}
|
|
||||||
return char;
|
|
||||||
}
|
|
||||||
|
|
||||||
beforeAll(function () {
|
|
||||||
NormalizedUnicodes = getNormalizedUnicodes();
|
|
||||||
});
|
|
||||||
|
|
||||||
afterAll(function () {
|
|
||||||
NormalizedUnicodes = null;
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should not reverse LTR characters", function () {
|
|
||||||
const A = getGlyphUnicode("A");
|
|
||||||
expect(reverseIfRtl(A)).toEqual("A");
|
|
||||||
|
|
||||||
const fi = getGlyphUnicode("\uFB01");
|
|
||||||
expect(reverseIfRtl(fi)).toEqual("fi");
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should reverse RTL characters", function () {
|
|
||||||
// Hebrew (no-op, since it's not a combined character)
|
|
||||||
const heAlef = getGlyphUnicode("\u05D0");
|
|
||||||
expect(reverseIfRtl(heAlef)).toEqual("\u05D0");
|
|
||||||
// Arabic
|
|
||||||
const arAlef = getGlyphUnicode("\u0675");
|
|
||||||
expect(reverseIfRtl(arAlef)).toEqual("\u0674\u0627");
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -18,8 +18,8 @@
|
|||||||
/** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
|
/** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
|
||||||
|
|
||||||
import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
|
import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
|
||||||
|
import { getCharacterType, getNormalizeWithNFKC } from "./pdf_find_utils.js";
|
||||||
import { createPromiseCapability } from "pdfjs-lib";
|
import { createPromiseCapability } from "pdfjs-lib";
|
||||||
import { getCharacterType } from "./pdf_find_utils.js";
|
|
||||||
|
|
||||||
const FindState = {
|
const FindState = {
|
||||||
FOUND: 0,
|
FOUND: 0,
|
||||||
@ -126,12 +126,7 @@ function normalize(text) {
|
|||||||
} else {
|
} else {
|
||||||
// Compile the regular expression for text normalization once.
|
// Compile the regular expression for text normalization once.
|
||||||
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
||||||
const toNormalizeWithNFKC =
|
const toNormalizeWithNFKC = getNormalizeWithNFKC();
|
||||||
"\u2460-\u2473" + // Circled numbers.
|
|
||||||
"\u24b6-\u24ff" + // Circled letters/numbers.
|
|
||||||
"\u3244-\u32bf" + // Circled ideograms/numbers.
|
|
||||||
"\u32d0-\u32fe" + // Circled ideograms.
|
|
||||||
"\uff00-\uffef"; // Halfwidth, fullwidth forms.
|
|
||||||
|
|
||||||
// 3040-309F: Hiragana
|
// 3040-309F: Hiragana
|
||||||
// 30A0-30FF: Katakana
|
// 30A0-30FF: Katakana
|
||||||
@ -840,6 +835,7 @@ class PDFFindController {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let promise = Promise.resolve();
|
let promise = Promise.resolve();
|
||||||
|
const textOptions = { disableNormalization: true };
|
||||||
for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
|
for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
|
||||||
const extractTextCapability = createPromiseCapability();
|
const extractTextCapability = createPromiseCapability();
|
||||||
this._extractTextPromises[i] = extractTextCapability.promise;
|
this._extractTextPromises[i] = extractTextCapability.promise;
|
||||||
@ -848,7 +844,7 @@ class PDFFindController {
|
|||||||
return this._pdfDocument
|
return this._pdfDocument
|
||||||
.getPage(i + 1)
|
.getPage(i + 1)
|
||||||
.then(pdfPage => {
|
.then(pdfPage => {
|
||||||
return pdfPage.getTextContent();
|
return pdfPage.getTextContent(textOptions);
|
||||||
})
|
})
|
||||||
.then(
|
.then(
|
||||||
textContent => {
|
textContent => {
|
||||||
|
@ -112,4 +112,46 @@ function getCharacterType(charCode) {
|
|||||||
return CharacterType.ALPHA_LETTER;
|
return CharacterType.ALPHA_LETTER;
|
||||||
}
|
}
|
||||||
|
|
||||||
export { CharacterType, getCharacterType };
|
let NormalizeWithNFKC;
|
||||||
|
function getNormalizeWithNFKC() {
|
||||||
|
/* eslint-disable no-irregular-whitespace */
|
||||||
|
NormalizeWithNFKC ||= ` ¨ª¯²-µ¸-º¼-¾IJ-ijĿ-ŀʼnſDŽ-njDZ-dzʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´-῾ - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉-ℓℕ-№ℙ-ℝ℠-™ℤΩℨK-ℭℯ-ℱℳ-ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕ 〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰꟲ-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ff-stﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ!-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ¢-₩`;
|
||||||
|
|
||||||
|
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
|
||||||
|
const ranges = [];
|
||||||
|
const range = [];
|
||||||
|
const diacriticsRegex = /^\p{M}$/u;
|
||||||
|
// Some chars must be replaced by their NFKC counterpart during a search.
|
||||||
|
for (let i = 0; i < 65536; i++) {
|
||||||
|
const c = String.fromCharCode(i);
|
||||||
|
if (c.normalize("NFKC") !== c && !diacriticsRegex.test(c)) {
|
||||||
|
if (range.length !== 2) {
|
||||||
|
range[0] = range[1] = i;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (range[1] + 1 !== i) {
|
||||||
|
if (range[0] === range[1]) {
|
||||||
|
ranges.push(String.fromCharCode(range[0]));
|
||||||
|
} else {
|
||||||
|
ranges.push(
|
||||||
|
`${String.fromCharCode(range[0])}-${String.fromCharCode(
|
||||||
|
range[1]
|
||||||
|
)}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
range[0] = range[1] = i;
|
||||||
|
} else {
|
||||||
|
range[1] = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (ranges.join("") !== NormalizeWithNFKC) {
|
||||||
|
throw new Error(
|
||||||
|
"getNormalizeWithNFKC - update the `NormalizeWithNFKC` string."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NormalizeWithNFKC;
|
||||||
|
}
|
||||||
|
|
||||||
|
export { CharacterType, getCharacterType, getNormalizeWithNFKC };
|
||||||
|
@ -368,6 +368,7 @@ class PDFPageView {
|
|||||||
if (!textLayer.renderingDone) {
|
if (!textLayer.renderingDone) {
|
||||||
const readableStream = pdfPage.streamTextContent({
|
const readableStream = pdfPage.streamTextContent({
|
||||||
includeMarkedContent: true,
|
includeMarkedContent: true,
|
||||||
|
disableNormalization: true,
|
||||||
});
|
});
|
||||||
textLayer.setTextContentSource(readableStream);
|
textLayer.setTextContentSource(readableStream);
|
||||||
}
|
}
|
||||||
|
@ -665,6 +665,8 @@ class PDFViewer {
|
|||||||
}
|
}
|
||||||
buffer.length = 0;
|
buffer.length = 0;
|
||||||
const page = await this.pdfDocument.getPage(pageNum);
|
const page = await this.pdfDocument.getPage(pageNum);
|
||||||
|
// By default, getTextContent is called with disableNormalization set to
|
||||||
|
// false, which is fine here because we want a normalized string.
|
||||||
const { items } = await page.getTextContent();
|
const { items } = await page.getTextContent();
|
||||||
for (const item of items) {
|
for (const item of items) {
|
||||||
if (item.str) {
|
if (item.str) {
|
||||||
|
@ -208,9 +208,20 @@ class TextHighlighter {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let lastDivIdx = -1;
|
||||||
|
let lastOffset = -1;
|
||||||
for (let i = i0; i < i1; i++) {
|
for (let i = i0; i < i1; i++) {
|
||||||
const match = matches[i];
|
const match = matches[i];
|
||||||
const begin = match.begin;
|
const begin = match.begin;
|
||||||
|
if (begin.divIdx === lastDivIdx && begin.offset === lastOffset) {
|
||||||
|
// This can happen when searching for 'f' in a text containing the 'ff'
|
||||||
|
// ligature: both matches begin at the same div and offset, so the
|
||||||
|
// duplicate is skipped rather than highlighting the ligature twice.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
lastDivIdx = begin.divIdx;
|
||||||
|
lastOffset = begin.offset;
|
||||||
|
|
||||||
const end = match.end;
|
const end = match.end;
|
||||||
const isSelected = isSelectedPage && i === selectedMatchIdx;
|
const isSelected = isSelectedPage && i === selectedMatchIdx;
|
||||||
const highlightSuffix = isSelected ? " selected" : "";
|
const highlightSuffix = isSelected ? " selected" : "";
|
||||||
|
@ -20,7 +20,8 @@
|
|||||||
// eslint-disable-next-line max-len
|
// eslint-disable-next-line max-len
|
||||||
/** @typedef {import("./text_accessibility.js").TextAccessibilityManager} TextAccessibilityManager */
|
/** @typedef {import("./text_accessibility.js").TextAccessibilityManager} TextAccessibilityManager */
|
||||||
|
|
||||||
import { renderTextLayer, updateTextLayer } from "pdfjs-lib";
|
import { normalizeUnicode, renderTextLayer, updateTextLayer } from "pdfjs-lib";
|
||||||
|
import { removeNullCharacters } from "./ui_utils.js";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @typedef {Object} TextLayerBuilderOptions
|
* @typedef {Object} TextLayerBuilderOptions
|
||||||
@ -212,6 +213,16 @@ class TextLayerBuilder {
|
|||||||
}
|
}
|
||||||
end.classList.remove("active");
|
end.classList.remove("active");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
div.addEventListener("copy", event => {
|
||||||
|
const selection = document.getSelection();
|
||||||
|
event.clipboardData.setData(
|
||||||
|
"text/plain",
|
||||||
|
removeNullCharacters(normalizeUnicode(selection.toString()))
|
||||||
|
);
|
||||||
|
event.preventDefault();
|
||||||
|
event.stopPropagation();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user