diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
index 5f43185c6..ad1e4d13a 100644
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@@ -54,6 +54,157 @@ public class SrtFromTtmlWriter {
out.write(text.getBytes(charset));
}

+ /**
+ * Decode XML or HTML entities into their actual (literal) characters.
+ *
+ * TTML is XML-based, so text nodes may contain escaped entities
+ * instead of direct characters. For example:
+ *
+ * "&amp;" → "&"
+ * "&lt;" → "<"
+ * "&gt;" → ">"
+ * "&#x9;" → "\t" (TAB)
+ * "&#xA;" (&#10;) → "\n" (LINE FEED)
+ *
+ * XML files cannot contain characters like "<", ">", "&" directly,
+ * so they must be represented using their entity-encoded forms.
+ *
+ * Jsoup sometimes leaves nested or encoded entities unresolved
+ * (e.g. inside <p> text nodes in TTML files), so this function
+ * acts as a final “safety net” to ensure all entities are decoded
+ * before further normalization.
+ *
+ * Character representation layers for reference:
+ * - Literal characters: <, >, &
+ * → appear in runtime/output text (e.g. final SRT output)
+ * - Escaped entities: &lt;, &gt;, &amp;
+ * → appear in XML/HTML/TTML source files
+ * - Numeric entities: &#xA0;, &#x9;, &#xD;
+ * → appear mainly in XML/TTML files (also valid in HTML)
+ * for non-printable or special characters
+ * - Unicode escapes: \u00A0 (Java/Unicode internal form)
+ * → appear only in Java source code (NOT valid in XML)
+ *
+ * XML entities include both named (&amp;, &lt;) and numeric
+ * (&#xA0;, &#160;) forms.
+ *
+ * @param encodedEntities The raw text fragment possibly containing
+ * encoded XML entities.
+ * @return A decoded string where all entities are replaced by their
+ * actual (literal) characters.
+ */
+    private String decodeXmlEntities(final String encodedEntities) {
+        // Extra decoding pass on top of what the XML parser already resolved.
+        // The boolean is Jsoup's "inAttribute" flag; per the Jsoup API docs it
+        // selects the stricter, attribute-style entity matching.
+        return Parser.unescapeEntities(encodedEntities, true);
+    }
+
+    /**
+     * Convert every kind of line break in the given text to the SRT line terminator.
+     *
+     * Well-formed TTML normally expresses line breaks with {@code <br/>} tags.
+     * XML nevertheless allows the character references &#xA; (LF, {@code \n})
+     * and &#xD; (CR, {@code \r}), alone or combined as CRLF ({@code \r\n}), so
+     * auto-generated or malformed TTML files may carry literal line breaks
+     * inside text nodes. As a defensive measure all three conventions are unified:
+     *
+     *   - Windows ({@code \r\n}), classic Mac OS ({@code \r}) and Unix ({@code \n})
+     *     → the writer's {@code NEW_LINE} constant (expected to be {@code \r\n}).
+     *
+     * This keeps such input rendering properly in SRT players instead of
+     * breaking the subtitle structure.
+     *
+     * @param text already-decoded text containing literal characters;
+     *             must not be {@code null}.
+     * @return the text with every line break replaced by {@code NEW_LINE}.
+     */
+    private String normalizeLineBreakForSrt(final String text) {
+        // Order matters: CRLF pairs must collapse to a single LF before lone
+        // CRs are rewritten, otherwise "\r\n" would turn into "\n\n" and every
+        // Windows-style line break would be duplicated.
+        final String lineFeedsOnly = text.replace("\r\n", "\n").replace("\r", "\n");
+
+        // Only LF is left at this point, so a single pass yields uniform output.
+        return lineFeedsOnly.replace("\n", NEW_LINE);
+    }
+
+    /**
+     * Normalize already-decoded subtitle text so that it is safe for SRT output.
+     *
+     * The following steps are applied in this order:
+     *   1. Space-like characters (U+00A0, U+202F, U+205F, U+3000 and the range
+     *      U+2000..U+200A) are replaced with a regular space (U+0020).
+     *   2. The invisible characters U+200B..U+200F are removed.
+     *   3. C0 control characters are removed, except TAB, LF and CR.
+     *   4. TAB is replaced with a single regular space.
+     *   5. Line breaks are unified by {@link #normalizeLineBreakForSrt(String)}.
+     *
+     * References:
+     *   - Unicode General Punctuation: https://unicode.org/charts/PDF/U2000.pdf
+     *   - Unicode Basic Latin: https://unicode.org/charts/PDF/U0000.pdf
+     *   - ASCII control characters: https://en.wikipedia.org/wiki/ASCII#Control_characters
+     *   - TTML Spec: https://www.w3.org/TR/ttml2/
+     *   - SRT Format: https://en.wikipedia.org/wiki/SubRip
+     *
+     * @param actualText text whose XML entities have already been decoded;
+     *                   must not be {@code null}.
+     * @return the normalized text.
+     */
+    private String normalizeForSrt(final String actualText) {
+        // Step 1: YouTube TTML mixes regular spaces with non-breaking and other
+        // Unicode spaces. They look identical but differ in encoding, which may
+        // cause display issues in SRT players and spurious comparison failures
+        // in tests, so all of them become a plain space.
+        final String plainSpaces = actualText
+                .replace('\u00A0', ' ') // Non-breaking space
+                .replace('\u202F', ' ') // Narrow no-break space
+                .replace('\u205F', ' ') // Medium mathematical space
+                .replace('\u3000', ' ') // Ideographic space
+                .replaceAll("[\\u2000-\\u200A]", " "); // En quad .. hair space
+
+        // Step 2: zero-width spaces, joiners and directional marks are invisible
+        // but still present in the encoding; they are dropped entirely.
+        // NOTE(review): this range also covers the zero-width (non-)joiners,
+        // which some scripts and emoji sequences rely on - confirm this is wanted.
+        final String withoutInvisibles = plainSpaces.replaceAll("[\\u200B-\\u200F]", "");
+
+        // Step 3: C0 control codes other than TAB, LF and CR are irrelevant in
+        // subtitles and may show up as square boxes or '?' in players.
+        final String withoutControls = withoutInvisibles
+                .replaceAll("[\\u0000-\\u0008\\u000B\\u000C\\u000E-\\u001F]", "");
+
+        // Step 4: tab width varies between editors and players, so a tab is
+        // rendered as one regular space for consistent display.
+        final String withoutTabs = withoutControls.replace('\t', ' ');
+
+        // Step 5: unify whatever line breaks are left.
+        return normalizeLineBreakForSrt(withoutTabs);
+    }
+
+    /**
+     * Turn the raw text of a single TTML text node into SRT-safe text.
+     *
+     * Remaining XML/HTML entities are decoded first; the resulting literal
+     * characters are then normalized for SRT output.
+     *
+     * @param raw the text fragment to sanitize; may be {@code null}.
+     * @return the sanitized text, or an empty string when {@code raw} is {@code null}.
+     */
+    private String sanitizeFragment(final String raw) {
+        if (raw == null) {
+            return "";
+        }
+
+        // Decode entities first so that normalization sees literal characters.
+        return normalizeForSrt(decodeXmlEntities(raw));
+    }
+
// CHECKSTYLE:OFF checkstyle:JavadocStyle
// checkstyle does not understand that span tags are inside a code block
/**
@@ -67,9 +218,25 @@ public class SrtFromTtmlWriter {
* @param node the current node to process
* @param text the {@link StringBuilder} to append the extracted text to
*/
+ // --------------------------------------------------------------------
+ // [INTERNAL NOTE] TTML text layer explanation
+ //
+ // TTML parsing involves multiple text "layers":
+ // 1. Raw XML entities (e.g., &lt;, &#xA0;) are decoded by Jsoup.
+ // 2. extractText() works on DOM TextNodes (already parsed strings).
+ // 3. sanitizeFragment() decodes remaining entities and fixes
+ // Unicode quirks.
+ // 4. normalizeForSrt() ensures literal text is safe for SRT output.
+ //
+ // In short:
+ // Jsoup handles XML-level syntax,
+ // our code handles text-level normalization for subtitles.
+ // --------------------------------------------------------------------
private void extractText(final Node node, final StringBuilder text) {
if (node instanceof TextNode textNode) {
- text.append((textNode).text());
+ String rawTtmlFragment = textNode.getWholeText();
+ String srtContent = sanitizeFragment(rawTtmlFragment);
+ text.append(srtContent);
} else if (node instanceof Element element) {
// <br> is a self-closing HTML tag used to insert a line break.
if (element.tagName().equalsIgnoreCase("br")) {
diff --git a/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java
new file mode 100644
index 000000000..755724f68
--- /dev/null
+++ b/app/src/test/java/org/schabi/newpipe/streams/SrtFromTtmlWriterTest.java
@@ -0,0 +1,320 @@
+package org.schabi.newpipe.streams;
+
+import org.junit.Test;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.parser.Parser;
+import java.io.ByteArrayInputStream;
+import java.lang.reflect.Method;
+import java.nio.charset.StandardCharsets;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Unit tests for {@link SrtFromTtmlWriter}.
+ *
+ * Tests focus on {@code extractText()} and its handling of TTML <p> elements.
+ * Note:
+ * - Uses reflection to call the private {@code extractText()} method.
+ * - Update {@code EXTRACT_TEXT_METHOD} if renamed.
+ *
+ * ---
+ * NOTE ABOUT ENTITIES VS UNICODE ESCAPES
+ *
+ * - In short:
+ * * UNICODE ESCAPES → used in Java source (e.g. SrtFromTtmlWriter.java)
+ * * ENTITIES → used in TTML strings (this test file)
+ *
+ * - TTML is an XML-based format. Real TTML subtitles often encode special
+ * characters as XML entities (named or numeric), e.g.:
+ * &amp; → '&' (\u0026)
+ * &lt; → '<' (\u003C)
+ * &#x9; → tab (\u0009)
+ * &#xA; → line feed (\u000A)
+ * &#xD; → carriage return (\u000D)
+ *
+ * - Java source code uses **Unicode escapes** (e.g. "\u00A0") which are resolved
+ * at compile time, so they do not represent real XML entities.
+ *
+ * - Purpose of these tests:
+ * We simulate *real TTML input* as NewPipe receives it — i.e., strings that
+ * still contain encoded XML entities (&#x9;, &#xA;, &#xD;, etc.).
+ * The production code (`decodeXmlEntities()`) must convert these into their
+ * actual Unicode characters before normalization.
+ */
+public class SrtFromTtmlWriterTest {
+ private static final String TTML_WRAPPER_START = "
without nested tags. + *
Hello World!
+ */ + private static final String SIMPLE_TTML = "Hello World!
"; + /** + * TTML example with nested tags withHello
World!
"
+ + "Hello
World!
" + + "<tag> & "text"''''" + + " " + + "
"; + /** + * TTML example with special characters: + * - Spaces appear at the beginning and end of the text. + * - Spaces are also present within the text (not just at the edges). + * - The text includes various HTML entities such as , + * &, <, >, etc. + * → non-breaking space (Unicode: '\u00A0', Entity: ' ') + */ + private static final String SPECIAL_TTML = "" + + " ~~-Hello &&<<>>World!! " + + "
"; + + /** + * TTML example with characters: tab. + * → \t + * They are separated by '+' for clarity. + */ + private static final String TAB_TTML = "" + + " + + " + + "
"; + + /** + * TTML example with line endings. + * → \r + */ + private static final String LINE_ENDING_0_TTML = "" + + " + + " + + "
"; + // → \n + private static final String LINE_ENDING_1_TTML = "" + + " + + " + + "
"; + private static final String LINE_ENDING_2_TTML = + "" + + " + + " + + "
"; + + /** + * TTML example with control characters. + * For example: + * → \u0001 + * → \u001F + * + * These control characters, if included as raw Unicode(e.g. '\u0001'), + * are either invalid in XML or rendered as '?' when processed. + * To avoid issues, they should be encoded(e.g. '') in TTML file. + * + * - Reference: + * Unicode Basic Latin (https://unicode.org/charts/PDF/U0000.pdf), + * ASCII Control (https://en.wikipedia.org/wiki/ASCII#Control_characters). + * and the defination of these characters can be known. + */ + private static final String CONTROL_CHAR_TTML = "" + + "+++++" + + "
"; + + + + private static final String EMPTY_TTML = "" + + "" + + "
"; + + /** + * TTML example with Unicode space characters. + * These characters are encoded using character references + * (XXXX;). + * + * Includes: + * ( ) '\u202F' → Narrow no-break space + * ( ) '\u205F' → Medium mathematical space + * ( ) '\u3000' → Ideographic space + * '\u2000' ~ '\u200A' are whitespace characters: + * ( ) '\u2000' → En quad + * ( ) '\u2002' → En space + * ( ) '\u200A' → Hair space + * + * Each character is separated by '+' for clarity. + */ + private static final String UNICODE_SPACE_TTML = "" + + " + + + + + " + + "
"; + + /** + * TTML example with non-spacing (invisible) characters. + * These are encoded using character references (XXXX;). + * + * Includes: + * ()'\u200B' → Zero-width space (ZWSP) + * ()'\u200E' → Left-to-right mark (LRM) + * ()'\u200F' → Right-to-left mark (RLM) + * + * They don't display any characters to the human eye. + * '+' is used between them for clarity in test output. + */ + private static final String NON_SPACING_TTML = "" + + "++" + + "
"; + + /** + * Parses TTML string into a JSoup Document and selects the firstelement. + * + * @param ttmlContent TTML content (e.g.,
...
) + * @return the firstelement + * @throws Exception if parsing or reflection fails + */ + private Element parseTtmlParagraph(final String ttmlContent) throws Exception { + final String ttml = TTML_WRAPPER_START + ttmlContent + TTML_WRAPPER_END; + final Document doc = Jsoup.parse( + new ByteArrayInputStream(ttml.getBytes(StandardCharsets.UTF_8)), + "UTF-8", "", Parser.xmlParser()); + return doc.select("body > div > p").first(); + } + + /** + * Invokes private extractText method via reflection. + * + * @param writer SrtFromTtmlWriter instance + * @param paragraph
element to extract text from
+ * @param text StringBuilder to store extracted text
+ * @throws Exception if reflection fails
+ */
+    private void invokeExtractText(final SrtFromTtmlWriter writer, final Element paragraph,
+                                   final StringBuilder text) throws Exception {
+        // Look the private method up by name on the writer's runtime class.
+        final Class<?> writerClass = writer.getClass();
+        final Method extractText = writerClass.getDeclaredMethod(
+                EXTRACT_TEXT_METHOD, Node.class, StringBuilder.class);
+
+        // The method is private, so access checks must be lifted before the call.
+        extractText.setAccessible(true);
+        extractText.invoke(writer, paragraph, text);
+    }
+
+    /**
+     * Runs the writer's text extraction on the first paragraph of the given TTML
+     * snippet and returns everything that was appended to the buffer.
+     *
+     * @param ttmlInput TTML paragraph markup to be wrapped, parsed and processed
+     * @return the text collected by {@code extractText()}
+     * @throws Exception if parsing or the reflective call fails
+     */
+    private String extractTextFromTtml(final String ttmlInput) throws Exception {
+        final StringBuilder collected = new StringBuilder();
+        final Element firstParagraph = parseTtmlParagraph(ttmlInput);
+        final SrtFromTtmlWriter srtWriter = new SrtFromTtmlWriter(null, false);
+
+        invokeExtractText(srtWriter, firstParagraph, collected);
+        return collected.toString();
+    }
+
+    /**
+     * The simple TTML sample must be extracted as "Hello World!".
+     */
+    @Test
+    public void testExtractTextSimpleParagraph() throws Exception {
+        assertEquals("Hello World!", extractTextFromTtml(SIMPLE_TTML));
+    }
+
+    /**
+     * The nested TTML sample must be extracted as the two words separated by CRLF.
+     */
+    @Test
+    public void testExtractTextNestedTags() throws Exception {
+        assertEquals("Hello\r\nWorld!", extractTextFromTtml(NESTED_TTML));
+    }
+
+ @Test
+ public void testExtractTextWithEntity() throws Exception {
+ final String expected = "