Context Navigation

← Previous Change
Next Change →

Changeset 688 for trunk

Timestamp:

01/13/14 15:43:42 (10 years ago)

Author:

ngw8

Message:

Improvements to WebParser's handling of blank text nodes, fixing bug where tables broke on conversion

File:

1 edited

trunk/src/org/expeditee/io/WebParser.java (modified) (4 diffs)

Legend:

Unmodified
Added
Removed

trunk/src/org/expeditee/io/WebParser.java

-              r672
+              r688
                                                 // Functions to be used later in JavaScript
+                                                JavaFX.WebEngineExecuteScript
+                                                                .invoke(webEngine,
+                                                                                "function isContent(el) { "
+                                                                                + "             if (el.tagName === 'P') {"
+                                                                                + "                     return true;"
+                                                                                + "             }"
+                                                                                + "             var text = el.textContent.replace(/(^\\s*)|(\\s*$)/, '');"
+                                                                                + "             var w = text.split(/\\S\\s\\S/).length; var punct = text.split(/\\.|\\,|\\!/).length;"
+                                                                                + "             if (punct < 2 || w < 10) {"
+                                                                                + "                     return false;"
+                                                                                + "             } else { "
+                                                                                + "                     return true;"
+                                                                                + "             }"
+                                                                                + "}"
+                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, ""
                                                                                 + "function addToSpan(text) {"
                                                                                 + "             span = document.createElement('wordSpan');"
                                                                                 + "             span.textContent = text;"
                                                                                 + "             textNode.parentElement.insertBefore(span, textNode);"
+                                                                                + "             par.insertBefore(span, refNode);"
                                                                                 + "             if (prevSpan !== null && span.getBoundingClientRect().top > prevSpan.getBoundingClientRect().top) {"
                                                                                 + "                     span.textContent = '\\n' + span.textContent;"
                                                                                 + "                     if ( prevPrevSpan !== null && prevPrevSpan.getBoundingClientRect().left == prevSpan.getBoundingClientRect().left) {"
                                                                                 + "                             prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;"
                                                                                 + "                             textNode.parentElement.removeChild(prevSpan);"
+                                                                                + "                             prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;"
+                                                                                + "                             par.removeChild(prevSpan);"
                                                                                 + "                     } else {"
                                                                                 + "                             prevPrevSpan = prevSpan;"
                                                                                 + "                     }"
+                                                                                + "                             prevPrevSpan = prevSpan;"
+                                                                                + "                     }"
                                                                                 + "                     prevSpan = span;"
                                                                                 + "             } else if ( prevSpan !== null) {"
                                                                                 + "                     prevSpan.textContent = prevSpan.textContent + span.textContent;"
                                                                                 + "                     textNode.parentElement.removeChild(span);"
                                                                                 + "             } else {"
                                                                                 + "                     prevSpan = span;"
+                                                                                + "                     prevSpan.textContent = prevSpan.textContent + span.textContent;"
+                                                                                + "                     par.removeChild(span);"
+                                                                                + "             } else {"
+                                                                                + "                     prevSpan = span;"
                                                                                 + "             }"
                                                                                 + "}"
 …
                                                 for (int i = 0; i < contentElementsLength; i++) {
                                                         // Getting the current HTML element, then making it accessable in JavaScript
+                                                        System.out.println(i + "/" + contentElementsLength);
+                                                        // Getting the current HTML element, then making it accessible in JavaScript
                                                         Element currentElement = (Element) JavaFX.JSObjectGetSlot.invoke(contentElements, i);
                                                         JavaFX.JSObjectSetMember.invoke(window, "para", currentElement);
-                                                        if ((boolean) (JavaFX.WebEngineExecuteScript.invoke(webEngine, "isContent(para)"))) {
+                                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, "para.style.wordBreak = 'normal';");
+                                                                // Creating a TreeWalker that is used to loop over all the TextNodes within the current paragraph
+                                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, "var walker = document.createTreeWalker(para, NodeFilter.SHOW_TEXT, null, false);");
+                                                                Node textNode;
+                                                        JavaFX.WebEngineExecuteScript.invoke(webEngine, "para.style.wordBreak = 'normal';");
+                                                        // Creating a TreeWalker that is used to loop over all the TextNodes within the current paragraph
+                                                        JavaFX.WebEngineExecuteScript.invoke(webEngine, "var walker = document.createTreeWalker(para, NodeFilter.SHOW_TEXT, null, false);");
+                                                        // Using Javascript to get an array of all the text nodes in the current node. Have to loop through twice (once
+                                                        // to
+                                                        // build the array and once actually going through the array, otherwise when the textnode is removed from
+                                                        // the document items end up being skipped)
+                                                        Object textNodes = JavaFX.WebEngineExecuteScript.invoke(webEngine, "function getTextNodes(rootNode){" + "var node;" + "var textNodes=[];"
+                                                                        + "var walk = document.createTreeWalker(rootNode, NodeFilter.SHOW_TEXT);" + "while(node=walk.nextNode()) {" + "if((node.textContent.trim().length > 0)) { "
+                                                                        + "textNodes.push(node);" + "}" + "}" + "return textNodes;" + "}; " + "getTextNodes(para)");
+                                                        int nodesLength = (Integer) JavaFX.JSObjectGetMember.invoke(textNodes, "length");
+                                                        // Looping through all the text nodes in the current paragraph
+                                                        for (int j = 0; j < nodesLength; j++) {
+                                                                Node currentNode = (Node) JavaFX.JSObjectGetSlot.invoke(textNodes, j);
+                                                                // Making the current node accessible in JavaScript
+                                                                JavaFX.JSObjectSetMember.invoke(window, "textNode", currentNode);
+                                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, ""
+                                                                                + "var span = null;"
+                                                                                + "var prevSpan = null;"
+                                                                                + "var prevPrevSpan = null;"
+                                                                                );
+                                                                // Splitting the text node's content into individual words
+                                                                String textContent = ((String) JavaFX.WebEngineExecuteScript.invoke(webEngine, "textNode.textContent")).replaceAll("\\n|\\r", "").replaceAll("\\s+", " ");
+                                                                String[] words = splitIntoWords(textContent);
+                                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, ""
+                                                                                + "var refNode = textNode.nextSibling;"
+                                                                                + "var par = textNode.parentElement;"
+                                                                                + "textNode.parentElement.removeChild(textNode)");
+                                                                // Adding each word back to the page
+                                                                for (int k = 0; k < words.length; k++) {
+                                                                        Object currentWord = words[k];
+                                                                        JavaFX.JSObjectCall.invoke(window, "addToSpan", new Object[] { currentWord });
+                                                                }
+                                                                // Looping through all the text nodes in the current paragraph
+                                                                while ((textNode = (Node) JavaFX.WebEngineExecuteScript.invoke(webEngine, "walker.nextNode()")) != null) {
+                                                                        // Making the current node accessible in JavaScript
+                                                                        JavaFX.JSObjectSetMember.invoke(window, "textNode", textNode);
+                                                                        JavaFX.WebEngineExecuteScript.invoke(webEngine, ""
+                                                                                        + "var span = null;"
+                                                                                        + "var prevSpan = null;"
+                                                                                        + "var prevPrevSpan = null;"
+                                                                                        );
+                                                                        // Splitting the text node's content into individual words
+                                                                        String textContent = (String) JavaFX.WebEngineExecuteScript.invoke(webEngine, "textNode.textContent");
+                                                                        String[] words = splitIntoWords(textContent);
+                                                                        // Clearing all text from the current text node (but not removing it, as it is needed as a reference
+                                                                        // point for adding back the words)
+                                                                        JavaFX.WebEngineExecuteScript.invoke(webEngine, "textNode.textContent = '';");
+                                                                        // Adding each word back to the page
+                                                                        for (int j = 0; j < words.length; j++) {
+                                                                                Object currentWord = words[j];
+                                                                                JavaFX.JSObjectCall.invoke(window, "addToSpan", new Object[] { currentWord });
+                                                                        }
+                                                                        JavaFX.WebEngineExecuteScript.invoke(webEngine, ""
+                                                                                        + "                     if (prevPrevSpan !== null && prevPrevSpan.getBoundingClientRect().left == prevSpan.getBoundingClientRect().left) {"
+                                                                                        + "                             prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;"
+                                                                                        + "                             textNode.parentElement.removeChild(prevSpan);"
+                                                                                        + "                     }"
+                                                                                        );
+                                                                }
+                                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, ""
+                                                                                + "                     if (prevPrevSpan !== null && prevPrevSpan.getBoundingClientRect().left == prevSpan.getBoundingClientRect().left) {"
+                                                                                + "                             prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;"
+                                                                                + "                             par.removeChild(prevSpan);"
+                                                                                + "                     }"
+                                                                                );
+                                                        }
+                                                }
+                                                // Using Javascript to get an array of all the nodes in the document
+                                                Object nodes = JavaFX.WebEngineExecuteScript.invoke(webEngine,
+                                                                "function getTextNodes(rootNode){"
+                                                                        + "var node;" + "var textNodes=[];"
+                                                                        + "var walk = document.createTreeWalker(rootNode, NodeFilter.SHOW_ALL);"
+                                                                        + "while(node=walk.nextNode()) {"
+                                                                                + " textNodes.push(node);"
+                                                                        + "}"
+                                                                        + "return textNodes;"
+                                                                + "}; "
+                                                                + "getTextNodes(document.body)");
+                                                int nodesLength = (Integer) JavaFX.JSObjectGetMember.invoke(nodes, "length");
+                                                for (int i = 0; i < nodesLength; i++) {
+                                                        Node currentNode = (Node) JavaFX.JSObjectGetSlot.invoke(nodes, i);
+                                                // Creating a TreeWalker that is used to loop over all the nodes within the document
+                                                JavaFX.WebEngineExecuteScript.invoke(webEngine, "var walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ALL);");
+                                                Node currentNode;
+                                                // Looping through all the text nodes in the current paragraph
+                                                while ((currentNode = (Node) JavaFX.WebEngineExecuteScript.invoke(webEngine, "walker.nextNode()")) != null) {
                                                         if (currentNode.getNodeType() == Node.TEXT_NODE || currentNode.getNodeType() == Node.ELEMENT_NODE) {
 …
                                                                                 Text t;
+                                                                                String textContent = currentNode.getTextContent(); // .replaceAll("[^\\S\\n]+", " ").trim();
+                                                                                String textContent = currentNode.getTextContent().replaceAll("[^\\S\\n]+", " ");
+                                                                                textContent = textContent.replaceAll("^(\\s)(\\n|\\r)", "");
                                                                                 if (textTransform.equals("uppercase")) {
 …
                 while (matcher.find()) {
                         String w = toSplit.substring(prevEndIndex, matcher.start());
+                        System.out.println(toSplit.substring(0, 0));
                         if (prev != null) {
                                 words.add(prev + " ");

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 688 for trunk

Legend:

trunk/src/org/expeditee/io/WebParser.java

Download in other formats: