Tweak the Bidi-detection heuristics for very short RTL strings (issue 11656)

Very short strings can narrowly miss the existing Bidi-detection threshold, leading to incorrect text-selection and copying behaviour.

In my testing, neither Adobe Reader nor PDFium seems to handle copying "correctly" for this document. Hence it's not entirely clear to me that we actually want to fix this, since tweaking these heuristics can *obviously* cause regressions elsewhere (and our test coverage for RTL-text isn't exactly great).
This commit is contained in:
Jonas Jenwald 2021-10-31 17:46:42 +01:00
parent 6a15973a1b
commit 5f77d3719b
5 changed files with 44 additions and 2 deletions

View File

@ -158,7 +158,8 @@ function bidi(str, startLevel = -1, vertical = false) {
// Detect the bidi method
// - If there are no rtl characters then no bidi needed
// - If less than 30% chars are rtl then string is primarily ltr
// - If less than 30% chars are rtl then string is primarily ltr,
// unless the string is very short.
// - If more than 30% chars are rtl then string is primarily rtl
if (numBidi === 0) {
isLTR = true;
@ -166,7 +167,7 @@ function bidi(str, startLevel = -1, vertical = false) {
}
if (startLevel === -1) {
if (numBidi / strLength < 0.3) {
if (numBidi / strLength < 0.3 && strLength > 4) {
isLTR = true;
startLevel = 0;
} else {

View File

@ -448,6 +448,7 @@
!annotation-square-circle-without-appearance.pdf
!annotation-stamp.pdf
!issue14048.pdf
!issue11656.pdf
!annotation-fileattachment.pdf
!annotation-text-widget.pdf
!annotation-choice-widget.pdf

BIN
test/pdfs/issue11656.pdf Normal file

Binary file not shown.

View File

@ -5080,6 +5080,12 @@
"lastPage": 1,
"type": "eq"
},
{ "id": "issue11656",
"file": "pdfs/issue11656.pdf",
"md5": "82d5d4f5978a4974707deb1ea98e62f2",
"rounds": 1,
"type": "text"
},
{ "id": "vertical",
"file": "pdfs/vertical.pdf",
"md5": "8a74d33504701edcefeef2afd022765e",

View File

@ -16,6 +16,28 @@
import { bidi } from "../../src/core/bidi.js";
describe("bidi", function () {
it(
  "should mark text as LTR if there's only LTR-characters, " +
    "when the string is very short",
  function () {
    // A three-character input with no RTL characters at all.
    const input = "foo";
    const { str: text, dir } = bidi(input, -1, false);

    // The text must come back unchanged and be flagged as left-to-right.
    expect(text).toEqual("foo");
    expect(dir).toEqual("ltr");
  }
);
it("should mark text as LTR if there's only LTR-characters", function () {
  // A longer input that contains LTR characters only.
  const input = "Lorem ipsum dolor sit amet, consectetur adipisicing elit.";
  const { str: text, dir } = bidi(input, -1, false);

  // The expected output is the input itself, i.e. nothing gets reordered.
  expect(text).toEqual(input);
  expect(dir).toEqual("ltr");
});
it("should mark text as RTL if more than 30% of text is RTL", function () {
// 33% of test text are RTL characters
const test = "\u0645\u0635\u0631 Egypt";
@ -34,4 +56,16 @@ describe("bidi", function () {
expect(bidiText.str).toEqual(result);
expect(bidiText.dir).toEqual("ltr");
});
it(
  "should mark text as RTL if less than 30% of text is RTL, " +
    "when the string is very short (issue 11656)",
  function () {
    // One RTL character out of four, i.e. 25% of the string is RTL.
    const input = "()\u05d1(";
    const { str: text, dir } = bidi(input, -1, false);

    // Despite being below the 30% ratio, this four-character string is
    // expected to be reordered and reported as right-to-left.
    expect(text).toEqual("(\u05d1)(");
    expect(dir).toEqual("rtl");
  }
);
});