- Timestamp: 01/13/14 15:43:42 (10 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/src/org/expeditee/io/WebParser.java
r672 r688 139 139 140 140 // Functions to be used later in JavaScript 141 JavaFX.WebEngineExecuteScript 142 .invoke(webEngine, 143 "function isContent(el) { " 144 + " if (el.tagName === 'P') {" 145 + " return true;" 146 + " }" 147 + " var text = el.textContent.replace(/(^\\s*)|(\\s*$)/, '');" 148 + " var w = text.split(/\\S\\s\\S/).length; var punct = text.split(/\\.|\\,|\\!/).length;" 149 + " if (punct < 2 || w < 10) {" 150 + " return false;" 151 + " } else { " 152 + " return true;" 153 + " }" 154 + "}" 155 141 JavaFX.WebEngineExecuteScript.invoke(webEngine, "" 156 142 + "function addToSpan(text) {" 157 143 + " span = document.createElement('wordSpan');" 158 144 + " span.textContent = text;" 159 + " textNode.parentElement.insertBefore(span, textNode);"145 + " par.insertBefore(span, refNode);" 160 146 + " if (prevSpan !== null && span.getBoundingClientRect().top > prevSpan.getBoundingClientRect().top) {" 161 147 + " span.textContent = '\\n' + span.textContent;" 162 148 + " if ( prevPrevSpan !== null && prevPrevSpan.getBoundingClientRect().left == prevSpan.getBoundingClientRect().left) {" 163 + " prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;" 164 + " textNode.parentElement.removeChild(prevSpan);"149 + " prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;" 150 + " par.removeChild(prevSpan);" 165 151 + " } else {" 166 + " prevPrevSpan = prevSpan;" 167 + " }" 152 + " prevPrevSpan = prevSpan;" 153 + " }" 168 154 + " prevSpan = span;" 169 155 + " } else if ( prevSpan !== null) {" 170 + " prevSpan.textContent = prevSpan.textContent + span.textContent;" 171 + " textNode.parentElement.removeChild(span);"172 + " } else {" 173 + " prevSpan = span;" 156 + " prevSpan.textContent = prevSpan.textContent + span.textContent;" 157 + " par.removeChild(span);" 158 + " } else {" 159 + " prevSpan = span;" 174 160 + " }" 175 161 + "}" … … 182 168 183 169 for (int i = 0; i < contentElementsLength; i++) { 184 185 // Getting the current 
HTML element, then making it access able in JavaScript170 System.out.println(i + "/" + contentElementsLength); 171 // Getting the current HTML element, then making it accessible in JavaScript 186 172 Element currentElement = (Element) JavaFX.JSObjectGetSlot.invoke(contentElements, i); 187 173 JavaFX.JSObjectSetMember.invoke(window, "para", currentElement); 188 189 if ((boolean) (JavaFX.WebEngineExecuteScript.invoke(webEngine, "isContent(para)"))) {190 174 191 JavaFX.WebEngineExecuteScript.invoke(webEngine, "para.style.wordBreak = 'normal';"); 192 193 // Creating a TreeWalker that is used to loop over all the TextNodes within the current paragraph 194 JavaFX.WebEngineExecuteScript.invoke(webEngine, "var walker = document.createTreeWalker(para, NodeFilter.SHOW_TEXT, null, false);"); 195 196 Node textNode; 175 JavaFX.WebEngineExecuteScript.invoke(webEngine, "para.style.wordBreak = 'normal';"); 176 177 // Creating a TreeWalker that is used to loop over all the TextNodes within the current paragraph 178 JavaFX.WebEngineExecuteScript.invoke(webEngine, "var walker = document.createTreeWalker(para, NodeFilter.SHOW_TEXT, null, false);"); 179 180 // Using Javascript to get an array of all the text nodes in the current node. 
Have to loop through twice (once 181 // to 182 // build the array and once actually going through the array, otherwise when the textnode is removed from 183 // the document items end up being skipped) 184 Object textNodes = JavaFX.WebEngineExecuteScript.invoke(webEngine, "function getTextNodes(rootNode){" + "var node;" + "var textNodes=[];" 185 + "var walk = document.createTreeWalker(rootNode, NodeFilter.SHOW_TEXT);" + "while(node=walk.nextNode()) {" + "if((node.textContent.trim().length > 0)) { " 186 + "textNodes.push(node);" + "}" + "}" + "return textNodes;" + "}; " + "getTextNodes(para)"); 187 188 int nodesLength = (Integer) JavaFX.JSObjectGetMember.invoke(textNodes, "length"); 189 190 // Looping through all the text nodes in the current paragraph 191 for (int j = 0; j < nodesLength; j++) { 192 Node currentNode = (Node) JavaFX.JSObjectGetSlot.invoke(textNodes, j); 193 194 // Making the current node accessible in JavaScript 195 JavaFX.JSObjectSetMember.invoke(window, "textNode", currentNode); 197 196 197 JavaFX.WebEngineExecuteScript.invoke(webEngine, "" 198 + "var span = null;" 199 + "var prevSpan = null;" 200 + "var prevPrevSpan = null;" 201 ); 202 203 // Splitting the text node's content into individual words 204 String textContent = ((String) JavaFX.WebEngineExecuteScript.invoke(webEngine, "textNode.textContent")).replaceAll("\\n|\\r", "").replaceAll("\\s+", " "); 205 String[] words = splitIntoWords(textContent); 206 207 JavaFX.WebEngineExecuteScript.invoke(webEngine, "" 208 + "var refNode = textNode.nextSibling;" 209 + "var par = textNode.parentElement;" 210 + "textNode.parentElement.removeChild(textNode)"); 211 212 // Adding each word back to the page 213 for (int k = 0; k < words.length; k++) { 214 Object currentWord = words[k]; 215 JavaFX.JSObjectCall.invoke(window, "addToSpan", new Object[] { currentWord }); 216 } 198 217 199 200 // Looping through all the text nodes in the current paragraph 201 while ((textNode = (Node) 
JavaFX.WebEngineExecuteScript.invoke(webEngine, "walker.nextNode()")) != null) { 202 // Making the current node accessible in JavaScript 203 JavaFX.JSObjectSetMember.invoke(window, "textNode", textNode); 204 205 JavaFX.WebEngineExecuteScript.invoke(webEngine, "" 206 + "var span = null;" 207 + "var prevSpan = null;" 208 + "var prevPrevSpan = null;" 209 ); 210 211 // Splitting the text node's content into individual words 212 String textContent = (String) JavaFX.WebEngineExecuteScript.invoke(webEngine, "textNode.textContent"); 213 String[] words = splitIntoWords(textContent); 214 215 // Clearing all text from the current text node (but not removing it, as it is needed as a reference 216 // point for adding back the words) 217 JavaFX.WebEngineExecuteScript.invoke(webEngine, "textNode.textContent = '';"); 218 219 // Adding each word back to the page 220 for (int j = 0; j < words.length; j++) { 221 Object currentWord = words[j]; 222 JavaFX.JSObjectCall.invoke(window, "addToSpan", new Object[] { currentWord }); 223 } 224 225 JavaFX.WebEngineExecuteScript.invoke(webEngine, "" 226 + " if (prevPrevSpan !== null && prevPrevSpan.getBoundingClientRect().left == prevSpan.getBoundingClientRect().left) {" 227 + " prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;" 228 + " textNode.parentElement.removeChild(prevSpan);" 229 + " }" 230 ); 231 } 218 JavaFX.WebEngineExecuteScript.invoke(webEngine, "" 219 + " if (prevPrevSpan !== null && prevPrevSpan.getBoundingClientRect().left == prevSpan.getBoundingClientRect().left) {" 220 + " prevPrevSpan.textContent = prevPrevSpan.textContent + prevSpan.textContent;" 221 + " par.removeChild(prevSpan);" 222 + " }" 223 ); 232 224 } 225 233 226 } 234 227 235 // Using Javascript to get an array of all the nodes in the document 236 Object nodes = JavaFX.WebEngineExecuteScript.invoke(webEngine, 237 "function getTextNodes(rootNode){" 238 + "var node;" + "var textNodes=[];" 239 + "var walk = document.createTreeWalker(rootNode, 
NodeFilter.SHOW_ALL);" 240 + "while(node=walk.nextNode()) {" 241 + " textNodes.push(node);" 242 + "}" 243 + "return textNodes;" 244 + "}; " 245 + "getTextNodes(document.body)"); 246 247 int nodesLength = (Integer) JavaFX.JSObjectGetMember.invoke(nodes, "length"); 248 249 for (int i = 0; i < nodesLength; i++) { 250 Node currentNode = (Node) JavaFX.JSObjectGetSlot.invoke(nodes, i); 228 // Creating a TreeWalker that is used to loop over all the nodes within the document 229 JavaFX.WebEngineExecuteScript.invoke(webEngine, "var walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ALL);"); 230 231 Node currentNode; 232 233 // Looping through all the text nodes in the current paragraph 234 while ((currentNode = (Node) JavaFX.WebEngineExecuteScript.invoke(webEngine, "walker.nextNode()")) != null) { 251 235 252 236 if (currentNode.getNodeType() == Node.TEXT_NODE || currentNode.getNodeType() == Node.ELEMENT_NODE) { … … 380 364 381 365 Text t; 382 String textContent = currentNode.getTextContent(); // .replaceAll("[^\\S\\n]+", " ").trim(); 366 367 String textContent = currentNode.getTextContent().replaceAll("[^\\S\\n]+", " "); 368 textContent = textContent.replaceAll("^(\\s)(\\n|\\r)", ""); 383 369 384 370 if (textTransform.equals("uppercase")) { … … 698 684 while (matcher.find()) { 699 685 String w = toSplit.substring(prevEndIndex, matcher.start()); 700 System.out.println(toSplit.substring(0, 0)); 686 701 687 if (prev != null) { 702 688 words.add(prev + " ");
Note: See TracChangeset for help on using the changeset viewer.