1 | /**
|
---|
2 | * DocumentStatsFast.java
|
---|
3 | * Copyright (C) 2010 New Zealand Digital Library, http://expeditee.org
|
---|
4 | *
|
---|
5 | * This program is free software: you can redistribute it and/or modify
|
---|
6 | * it under the terms of the GNU General Public License as published by
|
---|
7 | * the Free Software Foundation, either version 3 of the License, or
|
---|
8 | * (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This program is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | * GNU General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU General Public License
|
---|
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
|
---|
17 | */
|
---|
18 |
|
---|
19 | package org.expeditee.stats;
|
---|
20 |
|
---|
21 | import java.io.BufferedReader;
|
---|
22 | import java.io.FileNotFoundException;
|
---|
23 | import java.io.FileReader;
|
---|
24 | import java.io.IOException;
|
---|
25 | import java.util.HashSet;
|
---|
26 | import java.util.Set;
|
---|
27 |
|
---|
28 | import org.expeditee.gui.AttributeValuePair;
|
---|
29 | import org.expeditee.gui.FrameIO;
|
---|
30 | import org.expeditee.gui.MessageBay;
|
---|
31 | import org.expeditee.io.Conversion;
|
---|
32 | import org.expeditee.settings.folders.FolderSettings;
|
---|
33 |
|
---|
34 | public class DocumentStatsFast extends Stats {
|
---|
35 | protected int _treeFrames = 0;
|
---|
36 |
|
---|
37 | protected int _characters = 0;
|
---|
38 |
|
---|
39 | protected int _words = 0;
|
---|
40 |
|
---|
41 | protected int _textItems = 0;
|
---|
42 |
|
---|
43 | protected int _sentences = 0;
|
---|
44 |
|
---|
45 | protected String _name = null;
|
---|
46 |
|
---|
47 | protected String _title = null;
|
---|
48 |
|
---|
49 | public static int wordCount(String paragraph) {
|
---|
50 | return paragraph.trim().split("\\s+").length + 1;
|
---|
51 | }
|
---|
52 |
|
---|
53 | public DocumentStatsFast(String topFrame, String title) {
|
---|
54 | this(topFrame, new HashSet<String>());
|
---|
55 | _title = title;
|
---|
56 | }
|
---|
57 |
|
---|
58 | public DocumentStatsFast(String topFrame, Set<String> visited) {
|
---|
59 | _name = topFrame;
|
---|
60 | String lowerName = _name.toLowerCase();
|
---|
61 |
|
---|
62 | if (visited.contains(lowerName)) {
|
---|
63 | return;
|
---|
64 | }
|
---|
65 |
|
---|
66 | visited.add(_name.toLowerCase());
|
---|
67 | MessageBay.overwriteMessage("Computed: " + _name);
|
---|
68 |
|
---|
69 | // Initialise variables with the data for this frames comet
|
---|
70 | _words = 0;
|
---|
71 | _characters = 0;
|
---|
72 | _textItems = 0;
|
---|
73 | _sentences = 0;
|
---|
74 | _treeFrames = 1;
|
---|
75 |
|
---|
76 | String fullPath = null;
|
---|
77 | for (String possiblePath : FolderSettings.FrameDirs.getAbsoluteDirs()) {
|
---|
78 | fullPath = FrameIO.getFrameFullPathName(possiblePath, _name);
|
---|
79 | if (fullPath != null)
|
---|
80 | break;
|
---|
81 | }
|
---|
82 |
|
---|
83 | // If the frame was not located return null
|
---|
84 | if (fullPath == null)
|
---|
85 | return;
|
---|
86 |
|
---|
87 | String frameset = Conversion.getFramesetName(_name);
|
---|
88 |
|
---|
89 | // Open the file and search the text items
|
---|
90 | try {
|
---|
91 | BufferedReader reader = new BufferedReader(new FileReader(fullPath));
|
---|
92 | String next;
|
---|
93 | StringBuffer sb = new StringBuffer();
|
---|
94 | String link = null;
|
---|
95 | boolean ignore = false;
|
---|
96 | while (reader.ready() && ((next = reader.readLine()) != null)) {
|
---|
97 | if (next.length() == 0) {
|
---|
98 | // Ignore annotations
|
---|
99 | if (ignore) {
|
---|
100 | ignore = false;
|
---|
101 | link = null;
|
---|
102 | continue;
|
---|
103 | }
|
---|
104 |
|
---|
105 | // Ignore non text items
|
---|
106 | if (sb.length() == 0) {
|
---|
107 | link = null;
|
---|
108 | continue;
|
---|
109 | }
|
---|
110 |
|
---|
111 | if (link == null) {
|
---|
112 | // remove the last newLine... not absolutely needed
|
---|
113 | String text = sb.substring(0, sb.length() - 1);
|
---|
114 | _textItems++;
|
---|
115 | _characters += text.length();
|
---|
116 | _words += text.split("\\s+").length;
|
---|
117 | _sentences += text.split("\\.+").length;
|
---|
118 | } else {
|
---|
119 | DocumentStatsFast childItemStats = new DocumentStatsFast(
|
---|
120 | link, visited);
|
---|
121 | _characters += childItemStats._characters;
|
---|
122 | _words += childItemStats._words;
|
---|
123 | _textItems += childItemStats._textItems;
|
---|
124 | _sentences += childItemStats._sentences;
|
---|
125 | _treeFrames += childItemStats._treeFrames;
|
---|
126 | }
|
---|
127 | // Reinit the item variables
|
---|
128 | link = null;
|
---|
129 | sb = new StringBuffer();
|
---|
130 | } else if (ignore) {
|
---|
131 | continue;
|
---|
132 | } else if (next.startsWith("T")) {
|
---|
133 | String text = next.substring(2).trim();
|
---|
134 | // Ignore the rest of annotation items...
|
---|
135 | if (text.length() > 0
|
---|
136 | && text.charAt(0) == AttributeValuePair.ANNOTATION_CHAR) {
|
---|
137 | ignore = true;
|
---|
138 | continue;
|
---|
139 | }
|
---|
140 | sb.append(text).append('\n');
|
---|
141 | } else if (next.startsWith("F")) {
|
---|
142 | link = next.substring(2);
|
---|
143 | // Convert number only links
|
---|
144 | if (Character.isDigit(link.charAt(0)))
|
---|
145 | link = frameset + link;
|
---|
146 | }
|
---|
147 | }
|
---|
148 | } catch (FileNotFoundException e) {
|
---|
149 | e.printStackTrace();
|
---|
150 | } catch (IOException e) {
|
---|
151 | e.printStackTrace();
|
---|
152 | }
|
---|
153 | }
|
---|
154 |
|
---|
155 | @Override
|
---|
156 | public String toString() {
|
---|
157 | StringBuffer sb = new StringBuffer();
|
---|
158 | sb.append(SessionStats.getDate());
|
---|
159 | sb.append("DocStats: ").append(_name).append('\n');
|
---|
160 | sb.append("Title: ").append(_title).append('\n');
|
---|
161 | sb.append("Frames: ").append(_treeFrames).append('\n');
|
---|
162 | sb.append("TextItems: ").append(_textItems).append('\n');
|
---|
163 | sb.append("Sentences: ").append(_sentences).append('\n');
|
---|
164 | sb.append("Words: ").append(_words).append('\n');
|
---|
165 | sb.append("Chars: ").append(_characters);
|
---|
166 | return sb.toString();
|
---|
167 | }
|
---|
168 | }
|
---|