1 | /**
|
---|
2 | * Greenstone3Connection.java
|
---|
3 | * Copyright (C) 2010 New Zealand Digital Library, http://expeditee.org
|
---|
4 | *
|
---|
5 | * This program is free software: you can redistribute it and/or modify
|
---|
6 | * it under the terms of the GNU General Public License as published by
|
---|
7 | * the Free Software Foundation, either version 3 of the License, or
|
---|
8 | * (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This program is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | * GNU General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU General Public License
|
---|
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
|
---|
17 | */
|
---|
18 |
|
---|
19 | package org.expeditee.greenstone;
|
---|
20 |
|
---|
21 | import java.io.BufferedReader;
|
---|
22 | import java.io.IOException;
|
---|
23 | import java.io.InputStreamReader;
|
---|
24 | import java.io.PrintWriter;
|
---|
25 | import java.io.StringReader;
|
---|
26 | import java.net.Socket;
|
---|
27 | import java.net.UnknownHostException;
|
---|
28 | import java.util.ArrayList;
|
---|
29 | import java.util.Collections;
|
---|
30 | import java.util.HashMap;
|
---|
31 | import java.util.HashSet;
|
---|
32 | import java.util.Iterator;
|
---|
33 | import java.util.List;
|
---|
34 | import java.util.ListIterator;
|
---|
35 | import java.util.Map;
|
---|
36 | import java.util.Set;
|
---|
37 |
|
---|
38 | import org.apache.xerces.parsers.DOMParser;
|
---|
39 | import org.w3c.dom.Document;
|
---|
40 | import org.w3c.dom.NamedNodeMap;
|
---|
41 | import org.w3c.dom.Node;
|
---|
42 | import org.w3c.dom.NodeList;
|
---|
43 | import org.xml.sax.InputSource;
|
---|
44 | import org.xml.sax.SAXException;
|
---|
45 |
|
---|
46 | /**
|
---|
47 | * This class provides a simple API for communicating with a Greenstone 3 server
|
---|
48 | * using SOAP.
|
---|
49 | * <p>
|
---|
50 | * Greenstone 3 does not yet 'properly' implement SOAP-based web services. We
|
---|
51 | * would like to use a Greenstone WSDL (Web Services Definition Language) file
|
---|
52 | * and a higher level SOAP Client interface. But we can't. To get around this,
|
---|
53 | * this API uses a simple socket connection to the Greenstone 3 server, and
|
---|
54 | * sends SOAP requests as strings (XML documents). This works but isn't elegant.
|
---|
55 | * The server responds with a string representing an XML document.
|
---|
56 | * <p>
|
---|
57 | * The server's hostname and port are hard-coded. <b>Do not modify them.</b>
|
---|
58 | * <p>
|
---|
59 | * The Greenstone collection to use is <i>hcibib</i>, and this is also
|
---|
60 | * hard-coded. <b>Do not modify this.</b>
|
---|
61 | * <p>
|
---|
62 | * This collection can be accessed from a web browser at <a
|
---|
63 | * href="http://delaware.resnet.scms.waikato.ac.nz:8111/greenstone3/library?a=p&sa=about&c=hcibib">
|
---|
64 | * this location</a>.
|
---|
65 | */
|
---|
66 | public class Greenstone3Connection {
|
---|
67 | /** an ordered list of {@link Query} objects */
|
---|
68 | private List<Query> queryList;
|
---|
69 |
|
---|
70 | /**
|
---|
71 | * a HashMap of {@link ResultDocument} objects with document IDs as the
|
---|
72 | * keys. All the results returned in this session.
|
---|
73 | */
|
---|
74 | private Map<String, ResultDocument> allResults;
|
---|
75 |
|
---|
76 | /**
|
---|
77 | * a map keyed on the keywords found for all documents returned in this
|
---|
78 | * session. Each value is the set of IDs of the documents that were
|
---|
79 | * recorded under that key.
|
---|
80 | */
|
---|
81 | private Map<String, Set<String>> allKeywords;
|
---|
82 |
|
---|
83 | /**
|
---|
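84 | * a map keyed on the author names found for all documents returned in this session. Each value is the set of IDs of the documents with that author.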
|
---|
85 | */
|
---|
86 | private Map<String, Set<String>> allAuthors;
|
---|
87 |
|
---|
88 | /**
|
---|
89 | * a map keyed on the publication dates found for all documents returned
|
---|
90 | * in this session. Each value is the set of IDs of the documents with
|
---|
91 | * that date.
|
---|
92 | */
|
---|
93 | private Map<String, Set<String>> allDates;
|
---|
94 |
|
---|
95 | /**
|
---|
96 | * a map keyed on the journal names found for all documents returned in
|
---|
97 | * this session. Each value is the set of IDs of the documents published
|
---|
98 | * in that journal.
|
---|
99 | */
|
---|
100 | private Map<String, Set<String>> allJournals;
|
---|
101 |
|
---|
102 | /**
|
---|
103 | * a map keyed on the book titles found for all documents returned in
|
---|
104 | * this session. Each value is the set of IDs of the documents with that
|
---|
105 | * book title.
|
---|
106 | */
|
---|
107 | private Map<String, Set<String>> allBooktitles;
|
---|
108 |
|
---|
109 | /** the <i>hostname</i> where the Greenstone 3 server is running */
|
---|
110 | private String hostname;
|
---|
111 |
|
---|
112 | /** the <i>port</i> on which the Greenstone 3 server is running */
|
---|
113 | private int port;
|
---|
114 |
|
---|
115 | /** for communication with the server */
|
---|
116 | private Socket socket = null;
|
---|
117 |
|
---|
118 | /** for writing the SOAP request strings to the server socket */
|
---|
119 | private PrintWriter toGSDL = null;
|
---|
120 |
|
---|
121 | /** for reading the SOAP response strings from the server socket */
|
---|
122 | private BufferedReader fromGSDL = null;
|
---|
123 |
|
---|
124 | /** string that starts every SOAP request */
|
---|
125 | private String SOAPrequestHeader;
|
---|
126 |
|
---|
127 | /** acts as a template for every SOAP request string */
|
---|
128 | private String SOAPrequestMessage = "<?xml version='1.0' encoding='UTF-8'?><soapenv:Envelope xmlns:soapenv='http://schemas.xmlsoap.org/soap/envelope/' xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'><soapenv:Body><message><request lang='en' to='hcibib/PROCESSNAME' type='PROCESSTYPE'>REQUESTBODY</request></message></soapenv:Body></soapenv:Envelope>";
|
---|
129 |
|
---|
130 | /**
|
---|
131 | * A client application using this API will normally only create one
|
---|
132 | * instance of this class.
|
---|
133 | * <p>
|
---|
134 | * Create an instance with something like this
|
---|
135 | *
|
---|
136 | * <pre>
|
---|
137 | * Greenstone3Connection gsdl = new Greenstone3Connection();
|
---|
138 | * </pre>
|
---|
139 | *
|
---|
140 | * The constructor initialises the following <b>private</b> variables...
|
---|
141 | * <ul>
|
---|
142 | * <li>the <i>hostname</i> where the Greenstone 3 server is running</li>
|
---|
143 | * <li>the <i>port</i> on which the Greenstone 3 server is running</li>
|
---|
144 | * <li><i>queryList</i> an ordered list of {@link Query} objects</li>
|
---|
145 | * <li><i>allResults</i> a HashMap of {@link ResultDocument} objects with
|
---|
146 | * document IDs as the keys. All the results returned in this session.</li>
|
---|
147 | * <li><i>allKeywords</i> a HashMap keyed on the keywords found for all
|
---|
148 | * documents returned in this session. Each item in the map is itself a
|
---|
149 | * HashMap, keyed on document IDs with each item being NULL.</li>
|
---|
150 | * <li><i>allAuthors</i> a HashMap keyed on the author names found for all
|
---|
151 | * documents returned in this session. Each item in the map is itself a
|
---|
152 | * HashMap, keyed on document IDs with each item being NULL.</li>
|
---|
153 | * <li><i>allDates</i> a HashMap keyed on the publication dates found for
|
---|
154 | * all documents returned in this session. Each item in the map is itself a
|
---|
155 | * HashMap, keyed on document IDs with each item being NULL.</li>
|
---|
156 | * <li><i>allJournals</i> a HashMap keyed on the journal names found for
|
---|
157 | * all documents returned in this session. Each item in the map is itself a
|
---|
158 | * HashMap, keyed on document IDs with each item being NULL.</li>
|
---|
159 | * <li><i>allBooktitles</i> a HashMap keyed on the book titles found for
|
---|
160 | * all documents returned in this session. Each item in the map is itself a
|
---|
161 | * HashMap, keyed on document IDs with each item being NULL.</li>
|
---|
162 | * </ul>
|
---|
163 | */
|
---|
164 | public Greenstone3Connection(int location) {
|
---|
165 | if (location == 0) {
|
---|
166 | this.hostname = "comp537.cs.waikato.ac.nz";
|
---|
167 | this.port = 80;
|
---|
168 | this.SOAPrequestHeader = "POST /greenstone3/services/localsite HTTP/1.1\nHost: comp537.cs.waikato.ac.nz:80\nSOAPAction: hcibib/PROCESSNAME\nContent-Type: text/xml;charset=utf-8\nContent-Length: ";
|
---|
169 | } else {
|
---|
170 | this.hostname = "130.217.220.10";
|
---|
171 | this.port = 8111;
|
---|
172 | this.SOAPrequestHeader = "POST /greenstone3/services/localsite HTTP/1.1\nHost: 130.217.220.10:8111\nSOAPAction: hcibib/PROCESSNAME\nContent-Type: text/xml;charset=utf-8\nContent-Length: ";
|
---|
173 | }
|
---|
174 | this.queryList = Collections.synchronizedList(new ArrayList<Query>());
|
---|
175 | this.allResults = Collections
|
---|
176 | .synchronizedMap(new HashMap<String, ResultDocument>());
|
---|
177 | this.allKeywords = Collections.synchronizedMap(new HashMap<String, Set<String>>());
|
---|
178 | this.allAuthors = Collections
|
---|
179 | .synchronizedMap(new HashMap<String, Set<String>>());
|
---|
180 | this.allDates = Collections.synchronizedMap(new HashMap<String, Set<String>>());
|
---|
181 | this.allJournals = Collections.synchronizedMap(new HashMap<String, Set<String>>());
|
---|
182 | this.allBooktitles = Collections.synchronizedMap(new HashMap<String, Set<String>>());
|
---|
183 | }
|
---|
184 |
|
---|
185 | public Map<String, ResultDocument> getSessionResults() {
|
---|
186 | return this.allResults;
|
---|
187 | };
|
---|
188 |
|
---|
189 | /**
|
---|
190 | * Print a string representation of the list of queries issued in this
|
---|
191 | * session.
|
---|
192 | */
|
---|
193 | public void dumpQueryList() {
|
---|
194 | ListIterator iter = queryList.listIterator();
|
---|
195 | while (iter.hasNext()) {
|
---|
196 | Query query = (Query) iter.next();
|
---|
197 | System.out.println(query.toString());
|
---|
198 | }
|
---|
199 | }
|
---|
200 |
|
---|
201 | /**
|
---|
202 | * Print a string representation of the Booktitles occuring for all query
|
---|
203 | * results in this session. For each booktitle print the IDs of the
|
---|
204 | * documents with that booktitle.
|
---|
205 | */
|
---|
206 | public void dumpAllBooktitles() {
|
---|
207 | Set keys = allBooktitles.keySet();
|
---|
208 | Iterator iter = keys.iterator();
|
---|
209 | while (iter.hasNext()) {
|
---|
210 | String booktitle = (String) iter.next();
|
---|
211 | HashMap docMap = (HashMap) allBooktitles.get(booktitle);
|
---|
212 | System.out.println(booktitle);
|
---|
213 | System.out.println(docMap.keySet().toString());
|
---|
214 | }
|
---|
215 | }
|
---|
216 |
|
---|
217 | /**
|
---|
218 | * Print a string representation of the Journals occuring for all query
|
---|
219 | * results in this session. For each journal print the IDs of the documents
|
---|
220 | * with that journal.
|
---|
221 | */
|
---|
222 | public void dumpAllJournals() {
|
---|
223 | Set keys = allJournals.keySet();
|
---|
224 | Iterator iter = keys.iterator();
|
---|
225 | while (iter.hasNext()) {
|
---|
226 | String journal = (String) iter.next();
|
---|
227 | HashMap docMap = (HashMap) allJournals.get(journal);
|
---|
228 | System.out.println(journal);
|
---|
229 | System.out.println(docMap.keySet().toString());
|
---|
230 | }
|
---|
231 | }
|
---|
232 |
|
---|
233 | /**
|
---|
234 | * Print a string representation of the Dates occuring for all query results
|
---|
235 | * in this session. For each date print the IDs of the documents with that
|
---|
236 | * date.
|
---|
237 | */
|
---|
238 | public void dumpAllDates() {
|
---|
239 | Set keys = allDates.keySet();
|
---|
240 | Iterator iter = keys.iterator();
|
---|
241 | while (iter.hasNext()) {
|
---|
242 | String date = (String) iter.next();
|
---|
243 | HashMap docMap = (HashMap) allDates.get(date);
|
---|
244 | System.out.println(date);
|
---|
245 | System.out.println(docMap.keySet().toString());
|
---|
246 | }
|
---|
247 | }
|
---|
248 |
|
---|
249 | /**
|
---|
250 | * Print a string representation of the Authors occuring for all query
|
---|
251 | * results in this session. For each author print the IDs of the documents
|
---|
252 | * with that author.
|
---|
253 | */
|
---|
254 | public void dumpAllAuthors() {
|
---|
255 | Set keys = allAuthors.keySet();
|
---|
256 | Iterator iter = keys.iterator();
|
---|
257 | while (iter.hasNext()) {
|
---|
258 | String author = (String) iter.next();
|
---|
259 | HashMap docMap = (HashMap) allAuthors.get(author);
|
---|
260 | System.out.println(author);
|
---|
261 | System.out.println(docMap.keySet().toString());
|
---|
262 | }
|
---|
263 | }
|
---|
264 |
|
---|
265 | /**
|
---|
266 | * Print a string representation of the Keywords occuring for all query
|
---|
267 | * results in this session. For each keyword print the IDs of the documents
|
---|
268 | * with that keyword.
|
---|
269 | */
|
---|
270 | public void dumpAllKeywords() {
|
---|
271 | Set keys = allKeywords.keySet();
|
---|
272 | Iterator iter = keys.iterator();
|
---|
273 | while (iter.hasNext()) {
|
---|
274 | String keyword = (String) iter.next();
|
---|
275 | HashMap docMap = (HashMap) allKeywords.get(keyword);
|
---|
276 | System.out.println(keyword);
|
---|
277 | System.out.println(docMap.keySet().toString());
|
---|
278 | }
|
---|
279 | }
|
---|
280 |
|
---|
281 | /**
|
---|
282 | * Print a string representation of all the result documents returned by
|
---|
283 | * queries in this session.
|
---|
284 | */
|
---|
285 | public void dumpAllResults() {
|
---|
286 | Set keys = allResults.keySet();
|
---|
287 | Iterator iter = keys.iterator();
|
---|
288 |
|
---|
289 | while (iter.hasNext()) {
|
---|
290 | String docID = (String) iter.next();
|
---|
291 | ResultDocument resultDocument = allResults.get(docID);
|
---|
292 | System.out.println("____________" + docID + " ___________");
|
---|
293 | System.out.println(resultDocument.toString());
|
---|
294 | }
|
---|
295 | }
|
---|
296 |
|
---|
297 | /**
|
---|
298 | * Print all the result documents IDs returned by queries in this session,
|
---|
299 | * along with their titles.
|
---|
300 | */
|
---|
301 | public void dumpAllTitles() {
|
---|
302 | Set keys = allResults.keySet();
|
---|
303 | Iterator iter = keys.iterator();
|
---|
304 | while (iter.hasNext()) {
|
---|
305 | String docID = (String) iter.next();
|
---|
306 | ResultDocument resultDocument = allResults.get(docID);
|
---|
307 | System.out.println(docID + "\t" + resultDocument.getTitle());
|
---|
308 | }
|
---|
309 | }
|
---|
310 |
|
---|
311 | /**
|
---|
312 | * Provides the {@link ResultDocument} object for the document with the
|
---|
313 | * given ID
|
---|
314 | *
|
---|
315 | * @param docID
|
---|
316 | * is a document identifier, in the form returned by the server
|
---|
317 | * and available from a {@link QueryOutcome}
|
---|
318 | * @return the {@link ResultDocument} object reflecting the state of the
|
---|
319 | * result document at the time that this method was called. The
|
---|
320 | * state can change as more metadata is retrieved for the document
|
---|
321 | * and the document is returned by further queries.
|
---|
322 | */
|
---|
323 | public ResultDocument getDocument(String docID) {
|
---|
324 | return allResults.get(docID);
|
---|
325 | }
|
---|
326 |
|
---|
327 | /**
|
---|
328 | * Implements the actual communication with the server. <b>You can not call
|
---|
329 | * this method directly from your client code.</b>
|
---|
330 | * <p>
|
---|
331 | * Throws an exception and exits if the hosthame is not known or the
|
---|
332 | * connection can't be established.
|
---|
333 | * <p>
|
---|
334 | *
|
---|
335 | * @param request
|
---|
336 | * an already well formed string that contains the appropriate
|
---|
337 | * HTTP headers and a SOAP message (in XML form) that will ask
|
---|
338 | * the server for some information.
|
---|
339 | * @return a string containing a SOAP message (an XML document) that the
|
---|
340 | * server returned in response to the request
|
---|
341 | */
|
---|
342 | private String doRequest(String request) {
|
---|
343 | // System.err.println("Connecting to " + hostname + " on port " + port);
|
---|
344 | try {
|
---|
345 | try {
|
---|
346 | socket = new Socket(hostname, port);
|
---|
347 | } catch (SecurityException se) {
|
---|
348 | System.err.println("Security exception : " + se);
|
---|
349 | System.exit(1);
|
---|
350 | }
|
---|
351 | toGSDL = new PrintWriter(socket.getOutputStream(), true);
|
---|
352 | fromGSDL = new BufferedReader(new InputStreamReader(socket
|
---|
353 | .getInputStream()));
|
---|
354 | } catch (UnknownHostException e) {
|
---|
355 | System.err.println("Don't know about GSDL host: " + hostname);
|
---|
356 | System.exit(1);
|
---|
357 | } catch (IOException e) {
|
---|
358 | System.err.println("IO exception : " + e);
|
---|
359 | System.exit(1);
|
---|
360 | }
|
---|
361 |
|
---|
362 | String result = null;
|
---|
363 | toGSDL.println(request);
|
---|
364 | // System.err.println("Issued request to " + hostname + " on port " +
|
---|
365 | // port);
|
---|
366 | try {
|
---|
367 | String terminator = "Envelope>";
|
---|
368 | String response = "";
|
---|
369 |
|
---|
370 | char c;
|
---|
371 | do {
|
---|
372 | c = (char) fromGSDL.read();
|
---|
373 | response = response + c;
|
---|
374 | } while (!response.endsWith(terminator));
|
---|
375 | toGSDL.close();
|
---|
376 | fromGSDL.close();
|
---|
377 | socket.close();
|
---|
378 |
|
---|
379 | int start = response.indexOf("<?xml");
|
---|
380 | result = response.substring(start);
|
---|
381 | // System.out.println(result);
|
---|
382 | int a = result.indexOf('\n');
|
---|
383 | int b = result.indexOf('\n', a + 1);
|
---|
384 | while (a != -1 && b != -1) {
|
---|
385 | // System.out.println(a + " " +b);
|
---|
386 | result = result.substring(0, a - 1) + result.substring(b + 1);
|
---|
387 | a = result.indexOf('\n');
|
---|
388 | b = result.indexOf('\n', a + 1);
|
---|
389 | }
|
---|
390 | } catch (IOException e) {
|
---|
391 | System.err.println(e);
|
---|
392 | System.exit(1);
|
---|
393 | }
|
---|
394 | return result;
|
---|
395 | }
|
---|
396 |
|
---|
397 | /**
|
---|
398 | * Produces a SOAP request string, sends it to the server, gets and
|
---|
399 | * processes the response updating the appropriate data structures. Uses the
|
---|
400 | * settings represented in the provided argument to produce a SOAP request
|
---|
401 | * string. The string is sent to the server using the {@link doRequest}
|
---|
402 | * method. The returned XML document is processed and the information
|
---|
403 | * therein is used to store information about the returned documents and
|
---|
404 | * this query.
|
---|
405 | * <p>
|
---|
406 | * This method updates the {@link queryList} and {@link allResults} data
|
---|
407 | * <p>
|
---|
408 | *
|
---|
409 | * @param query
|
---|
410 | * a {@link Query} object that must be constructed and passed to
|
---|
411 | * this method by the calling client application
|
---|
412 | * @return a {@link QueryOutcome} object that stores information about the
|
---|
413 | * server's response
|
---|
414 | *
|
---|
415 | */
|
---|
416 | public QueryOutcome issueQueryToServer(Query query) {
|
---|
417 | QueryOutcome queryOutcome = new QueryOutcome();
|
---|
418 | String result = null;
|
---|
419 | String requestBody = "<paramList><param name='maxDocs' value='MAXDOCS'/><param name='level' value='Sec'/><param name ='index' value='INDEX'/><param name='matchMode' value='MATCHMODE'/><param name='query' value='QUERY'/><param name='case' value='CASE'/><param name='sortBy' value='SORTBY'/><param name='stem' value='STEM'/><param name='firstDoc' value='FIRSTDOC'/><param name='lastDoc' value='LASTDOC'/></paramList>";
|
---|
420 | requestBody = requestBody.replaceFirst("MAXDOCS", query
|
---|
421 | .getMaxDocsToReturn());
|
---|
422 | requestBody = requestBody.replaceFirst("INDEX", query.getIndex());
|
---|
423 | requestBody = requestBody.replaceFirst("MATCHMODE", query
|
---|
424 | .getMatchMode());
|
---|
425 | requestBody = requestBody.replaceFirst("QUERY", query.getQueryText());
|
---|
426 | requestBody = requestBody.replaceFirst("CASE", query.getCasefolding());
|
---|
427 | requestBody = requestBody.replaceFirst("SORTBY", query.getSortBy());
|
---|
428 | requestBody = requestBody.replaceFirst("STEM", query.getStemming());
|
---|
429 | requestBody = requestBody.replaceFirst("FIRSTDOC", query.getFirstDoc());
|
---|
430 | requestBody = requestBody.replaceFirst("LASTDOC", query.getLastDoc());
|
---|
431 | String request = SOAPrequestMessage.replaceFirst("PROCESSNAME",
|
---|
432 | "TextQuery");
|
---|
433 | request = request.replaceFirst("PROCESSTYPE", "process");
|
---|
434 | request = request.replaceFirst("REQUESTBODY", requestBody);
|
---|
435 | request = SOAPrequestHeader.replaceFirst("PROCESSNAME", "TextQuery")
|
---|
436 | + request.length() + "\n\n" + request;
|
---|
437 |
|
---|
438 | int firstDoc = java.lang.Integer.parseInt(query.getFirstDoc());
|
---|
439 |
|
---|
440 | result = doRequest(request);
|
---|
441 | // System.out.println("\n\n" + result + "\n");
|
---|
442 | StringReader sr = new StringReader(result);
|
---|
443 | InputSource is = new InputSource(sr);
|
---|
444 |
|
---|
445 | DOMParser p = new DOMParser();
|
---|
446 | try {
|
---|
447 | p.parse(is);
|
---|
448 | } catch (SAXException se) {
|
---|
449 | System.err.println(se);
|
---|
450 | } catch (IOException ioe) {
|
---|
451 | System.err.println(ioe);
|
---|
452 | }
|
---|
453 | Document d = p.getDocument();
|
---|
454 | NodeList metadataList = d.getElementsByTagName("metadata");
|
---|
455 | for (int i = 0; i < metadataList.getLength(); i++) {
|
---|
456 | Node n = metadataList.item(i);
|
---|
457 | NamedNodeMap nnm = n.getAttributes();
|
---|
458 | Node att = nnm.getNamedItem("name");
|
---|
459 | if (att.getNodeValue().compareTo("numDocsMatched") == 0) {
|
---|
460 | queryOutcome.setHowManyDocsMatched(n.getFirstChild()
|
---|
461 | .getNodeValue());
|
---|
462 | } else if (att.getNodeValue().compareTo("numDocsReturned") == 0) {
|
---|
463 | queryOutcome.setHowManyDocsReturned(n.getFirstChild()
|
---|
464 | .getNodeValue());
|
---|
465 | }
|
---|
466 | }
|
---|
467 |
|
---|
468 | NodeList documentList = d.getElementsByTagName("documentNode");
|
---|
469 | for (int i = 0; i < documentList.getLength(); i++) {
|
---|
470 | Node n = documentList.item(i);
|
---|
471 | NamedNodeMap nnm = n.getAttributes();
|
---|
472 | Node nid = nnm.getNamedItem("nodeID");
|
---|
473 | Node nscore = nnm.getNamedItem("rank");
|
---|
474 | String docID = nid.getFirstChild().getNodeValue();
|
---|
475 | queryOutcome.addResult(docID, firstDoc + i, nscore.getFirstChild()
|
---|
476 | .getNodeValue());
|
---|
477 | }
|
---|
478 | query.addQueryOutcome(queryOutcome);
|
---|
479 | Query q = (Query) query.clone();
|
---|
480 | queryList.add(q);
|
---|
481 |
|
---|
482 | for (int i = 0; i < documentList.getLength(); i++) {
|
---|
483 | Node n = documentList.item(i);
|
---|
484 | NamedNodeMap nnm = n.getAttributes();
|
---|
485 | Node nid = nnm.getNamedItem("nodeID");
|
---|
486 | Node nscore = nnm.getNamedItem("rank");
|
---|
487 | String docID = nid.getFirstChild().getNodeValue();
|
---|
488 |
|
---|
489 | QueryContext queryContext = new QueryContext(firstDoc + i, nscore
|
---|
490 | .getFirstChild().getNodeValue(), q);
|
---|
491 | if (allResults.containsKey(docID)) {
|
---|
492 | ResultDocument resultDocument = allResults.get(docID);
|
---|
493 | resultDocument.incrementFrequencyReturned();
|
---|
494 | resultDocument.addQueryContext(queryContext);
|
---|
495 | allResults.put(docID, resultDocument);
|
---|
496 | } else {
|
---|
497 | ResultDocument resultDocument = new ResultDocument();
|
---|
498 | resultDocument.addQueryContext(queryContext);
|
---|
499 | allResults.put(docID, resultDocument);
|
---|
500 | }
|
---|
501 | }
|
---|
502 | return queryOutcome;
|
---|
503 | }
|
---|
504 |
|
---|
505 | /**
|
---|
506 | * Produces a SOAP request string, sends it to the server, gets and
|
---|
507 | * processes the response updating the appropriate data structures. Given a
|
---|
508 | * document identifier and the name of a metadata item, this method produces
|
---|
509 | * a SOAP request string. The string is sent to the server using the
|
---|
510 | * {@link doRequest} method.
|
---|
511 | * <p>
|
---|
512 | * The request is simply for the values of the given metadata item of the
|
---|
513 | * given document. <b>If the metadata item for the given document has
|
---|
514 | * already been retrieved from the server, the server is NOT contacted
|
---|
515 | * again.</b>
|
---|
516 | * <p>
|
---|
517 | * The returned XML document is processed. The {@link ResultDocument} object
|
---|
518 | * for the document in question is updated with the returned metadata
|
---|
519 | * information, and the {@link allResults} data is consequently updated.
|
---|
520 | * <p>
|
---|
521 | * If the requested metadata is one of Keywords, Authors, Dates, Journals,
|
---|
522 | * Booktitles then the appropriate data structure is updated.
|
---|
523 | * <p>
|
---|
524 | * The method does not return a value. Private data structures are updated
|
---|
525 | * instead. The calling client application should proceed to access document
|
---|
526 | * metadata using the provided methods.
|
---|
527 | * <p>
|
---|
528 | *
|
---|
529 | * @param docID
|
---|
530 | * is a document identifier, in the form returned by the server
|
---|
531 | * and available from a {@link QueryOutcome}
|
---|
532 | * @param metadata
|
---|
533 | * is the metadata field whose value is to be retrieved. Valid
|
---|
534 | * values are
|
---|
535 | * <ul>
|
---|
536 | * <li>Title</li>
|
---|
537 | * <li>Creator (the authors)</li>
|
---|
538 | * <li>Journal</li>
|
---|
539 | * <li>Booktitle</li>
|
---|
540 | * <li>Volume</li>
|
---|
541 | * <li>Number</li>
|
---|
542 | * <li>Editor</li>
|
---|
543 | * <li>Pages</li>
|
---|
544 | * <li>Publisher</li>
|
---|
545 | * <li>Date</li>
|
---|
546 | * <li>Keywords</li>
|
---|
547 | * <li>Abstract</li>
|
---|
548 | * </ul>
|
---|
549 | */
|
---|
550 | public void getDocumentMetadataFromServer(String docID, String metadata) {
|
---|
551 | ResultDocument resultDocument = allResults.get(docID);
|
---|
552 | if (resultDocument.metadataExists(metadata)) {
|
---|
553 | return;
|
---|
554 | }
|
---|
555 |
|
---|
556 | String result = null;
|
---|
557 | String requestBody = "<paramList><param name='metadata' value='METADATAFIELD'/></paramList><documentNodeList><documentNode nodeID='DOCIDVALUE'/></documentNodeList>";
|
---|
558 | requestBody = requestBody.replaceFirst("METADATAFIELD", metadata);
|
---|
559 | requestBody = requestBody.replaceFirst("DOCIDVALUE", docID);
|
---|
560 | String request = SOAPrequestMessage.replaceFirst("PROCESSNAME",
|
---|
561 | "DocumentMetadataRetrieve");
|
---|
562 | request = request.replaceFirst("PROCESSTYPE", "process");
|
---|
563 | request = request.replaceFirst("REQUESTBODY", requestBody);
|
---|
564 |
|
---|
565 | request = SOAPrequestHeader.replaceFirst("PROCESSNAME",
|
---|
566 | "DocumentMetadataRetrieve")
|
---|
567 | + request.length() + "\n\n" + request;
|
---|
568 |
|
---|
569 | result = doRequest(request);
|
---|
570 | StringReader sr = new StringReader(result);
|
---|
571 | InputSource is = new InputSource(sr);
|
---|
572 | DOMParser p = new DOMParser();
|
---|
573 | try {
|
---|
574 | p.parse(is);
|
---|
575 | } catch (SAXException se) {
|
---|
576 | System.err.println(se);
|
---|
577 | } catch (IOException ioe) {
|
---|
578 | System.err.println(ioe);
|
---|
579 | }
|
---|
580 | Document d = p.getDocument();
|
---|
581 | NodeList metadataList = d.getElementsByTagName("metadata");
|
---|
582 | String metadataval = null;
|
---|
583 | if (metadataList.getLength() > 0) {
|
---|
584 | Node n = metadataList.item(0);
|
---|
585 | metadataval = n.getFirstChild().getNodeValue();
|
---|
586 |
|
---|
587 | if (metadata.compareTo("Keywords") == 0) {
|
---|
588 | String[] keywords = metadataval.split(",");
|
---|
589 | for (int i = 0; i < keywords.length; i++) {
|
---|
590 | String s = keywords[i].trim().toLowerCase();
|
---|
591 | resultDocument.addKeyword(s);
|
---|
592 | Set<String> docSet = allKeywords.get(metadataval);
|
---|
593 | if (docSet == null) {
|
---|
594 | docSet = new HashSet<String>();
|
---|
595 | }
|
---|
596 | docSet.add(docID);
|
---|
597 | allKeywords.put(metadataval, docSet);
|
---|
598 | }
|
---|
599 | } else if (metadata.compareTo("Creator") == 0) {
|
---|
600 | String[] authors = metadataval.split("(,)|( and )");
|
---|
601 | // System.err.println(metadataval);
|
---|
602 | for (int i = 0; i < authors.length; i++) {
|
---|
603 | authors[i] = authors[i].trim().toLowerCase();
|
---|
604 | }
|
---|
605 |
|
---|
606 | boolean containsExtraName = authors.length % 2 != 0;
|
---|
607 |
|
---|
608 | for (int i = 0; i + 1 < authors.length; i = i + 2) {
|
---|
609 | String s = authors[i] + ", " + authors[i + 1];
|
---|
610 |
|
---|
611 | // Handle names with jr. in them
|
---|
612 | if (containsExtraName) {
|
---|
613 | if (i + 2 < authors.length
|
---|
614 | && authors[i + 2].contains("jr")) {
|
---|
615 | s += " " + authors[i + 2];
|
---|
616 | i++;
|
---|
617 | }
|
---|
618 | }
|
---|
619 |
|
---|
620 | s = s.replaceAll("[.]", "");
|
---|
621 | // System.err.println(s);
|
---|
622 | resultDocument.addAuthor(s);
|
---|
623 |
|
---|
624 | Set<String> docSet = allAuthors.get(s);
|
---|
625 | if (docSet == null) {
|
---|
626 | docSet = new HashSet<String>();
|
---|
627 | }
|
---|
628 | docSet.add(docID);
|
---|
629 | allAuthors.put(s, docSet);
|
---|
630 | }
|
---|
631 | } else if (metadata.compareTo("Title") == 0) {
|
---|
632 | resultDocument.setTitle(metadataval);
|
---|
633 | } else if (metadata.compareTo("Booktitle") == 0) {
|
---|
634 | resultDocument.setBooktitle(metadataval);
|
---|
635 |
|
---|
636 | Set<String> docSet = allBooktitles.get(metadataval);
|
---|
637 | if (docSet == null) {
|
---|
638 | docSet = new HashSet<String>();
|
---|
639 | }
|
---|
640 | docSet.add(docID);
|
---|
641 | allBooktitles.put(metadataval, docSet);
|
---|
642 | } else if (metadata.compareTo("Date") == 0) {
|
---|
643 | resultDocument.setDate(metadataval.replaceAll("[^0-9]", ""));
|
---|
644 | Set<String> docSet = allDates.get(metadataval);
|
---|
645 | if (docSet == null) {
|
---|
646 | docSet = new HashSet<String>();
|
---|
647 | }
|
---|
648 | docSet.add(docID);
|
---|
649 | allDates.put(metadataval, docSet);
|
---|
650 | } else if (metadata.compareTo("Pages") == 0) {
|
---|
651 | resultDocument.setPages(metadataval);
|
---|
652 | } else if (metadata.compareTo("Journal") == 0) {
|
---|
653 | resultDocument.setJournal(metadataval);
|
---|
654 | Set<String> docSet = allJournals.get(metadataval);
|
---|
655 | if (docSet == null) {
|
---|
656 | docSet = new HashSet<String>();
|
---|
657 | }
|
---|
658 | docSet.add(docID);
|
---|
659 | allJournals.put(metadataval, docSet);
|
---|
660 | } else if (metadata.compareTo("Volume") == 0) {
|
---|
661 | resultDocument.setVolume(metadataval);
|
---|
662 | } else if (metadata.compareTo("Number") == 0) {
|
---|
663 | resultDocument.setNumber(metadataval);
|
---|
664 | } else if (metadata.compareTo("Abstract") == 0) {
|
---|
665 | resultDocument.setAbstract(metadataval);
|
---|
666 | } else if (metadata.compareTo("Editor") == 0) {
|
---|
667 | resultDocument.setEditor(metadataval);
|
---|
668 | } else if (metadata.compareTo("Publisher") == 0) {
|
---|
669 | resultDocument.setPublisher(metadataval);
|
---|
670 | }
|
---|
671 |
|
---|
672 | }
|
---|
673 | allResults.put(docID, resultDocument);
|
---|
674 | }
|
---|
675 |
|
---|
676 | public String getClassifierNodeName(String nodeID) {
|
---|
677 | String result = null;
|
---|
678 | String requestBody = "<paramList><param name='metadata' value='Title'/></paramList><classifierNodeList><classifierNode nodeID='NODEID'/></classifierNodeList>";
|
---|
679 | requestBody = requestBody.replaceFirst("NODEID", nodeID);
|
---|
680 | String request = SOAPrequestMessage.replaceFirst("PROCESSNAME",
|
---|
681 | "ClassifierBrowseMetadataRetrieve");
|
---|
682 | request = request.replaceFirst("PROCESSTYPE", "process");
|
---|
683 | request = request.replaceFirst("REQUESTBODY", requestBody);
|
---|
684 |
|
---|
685 | request = SOAPrequestHeader.replaceFirst("PROCESSNAME",
|
---|
686 | "ClassifierBrowseMetadataRetrieve")
|
---|
687 | + request.length() + "\n\n" + request;
|
---|
688 |
|
---|
689 | // System.err.println(request);
|
---|
690 | result = doRequest(request);
|
---|
691 | // System.err.println(result);
|
---|
692 |
|
---|
693 | StringReader sr = new StringReader(result);
|
---|
694 | InputSource is = new InputSource(sr);
|
---|
695 | DOMParser p = new DOMParser();
|
---|
696 | try {
|
---|
697 | p.parse(is);
|
---|
698 | } catch (SAXException se) {
|
---|
699 | System.err.println(se);
|
---|
700 | } catch (IOException ioe) {
|
---|
701 | System.err.println(ioe);
|
---|
702 | }
|
---|
703 |
|
---|
704 | String returnName = null;
|
---|
705 |
|
---|
706 | Document d = p.getDocument();
|
---|
707 | // Document d = null;
|
---|
708 | NodeList metadataList = d.getElementsByTagName("metadata");
|
---|
709 | for (int i = 0; i < metadataList.getLength(); i++) {
|
---|
710 | Node n = metadataList.item(i);
|
---|
711 | NamedNodeMap nnm = n.getAttributes();
|
---|
712 | Node att = nnm.getNamedItem("name");
|
---|
713 | if (att.getNodeValue().compareTo("Title") == 0) {
|
---|
714 | returnName = n.getFirstChild().getNodeValue();
|
---|
715 | }
|
---|
716 | }
|
---|
717 | return returnName;
|
---|
718 | }
|
---|
719 |
|
---|
720 | public void getClassifierNodes(String rootNode) {
|
---|
721 | String result = null;
|
---|
722 | String requestBody = "<paramList><param name='structure' value='children'/></paramList><classifierNodeList><classifierNode nodeID='CLASSIFIER'/></classifierNodeList>";
|
---|
723 | requestBody = requestBody.replaceFirst("CLASSIFIER", rootNode);
|
---|
724 | String request = SOAPrequestMessage.replaceFirst("PROCESSNAME",
|
---|
725 | "ClassifierBrowse");
|
---|
726 | request = request.replaceFirst("PROCESSTYPE", "process");
|
---|
727 | request = request.replaceFirst("REQUESTBODY", requestBody);
|
---|
728 |
|
---|
729 | request = SOAPrequestHeader.replaceFirst("PROCESSNAME",
|
---|
730 | "ClassifierBrowse")
|
---|
731 | + request.length() + "\n\n" + request;
|
---|
732 |
|
---|
733 | System.err.println(getClassifierNodeName(rootNode));
|
---|
734 | // System.err.print(rootNode + "#");
|
---|
735 |
|
---|
736 | // System.err.println(request);
|
---|
737 | result = doRequest(request);
|
---|
738 | // System.err.println(result);
|
---|
739 |
|
---|
740 | StringReader sr = new StringReader(result);
|
---|
741 | InputSource is = new InputSource(sr);
|
---|
742 | DOMParser p = new DOMParser();
|
---|
743 | try {
|
---|
744 | p.parse(is);
|
---|
745 | } catch (SAXException se) {
|
---|
746 | System.err.println(se);
|
---|
747 | } catch (IOException ioe) {
|
---|
748 | System.err.println(ioe);
|
---|
749 | }
|
---|
750 | Document d = p.getDocument();
|
---|
751 |
|
---|
752 | NodeList childList = d.getElementsByTagName("classifierNode");
|
---|
753 | NodeList documentList = d.getElementsByTagName("documentNode");
|
---|
754 | // System.err.println("\td " + documentList.getLength());
|
---|
755 | // System.err.println("\tc " + childList.getLength());
|
---|
756 |
|
---|
757 | if (childList.getLength() > 0) {
|
---|
758 | for (int i = 0; i < childList.getLength(); i++) {
|
---|
759 | Node n = childList.item(i);
|
---|
760 | NamedNodeMap nnm = n.getAttributes();
|
---|
761 | Node nid = nnm.getNamedItem("nodeID");
|
---|
762 | String nodeID = nid.getFirstChild().getNodeValue();
|
---|
763 |
|
---|
764 | // System.err.println("\tchild : " + nodeID);
|
---|
765 |
|
---|
766 | if (nodeID.compareTo(rootNode) != 0
|
---|
767 | && nodeID.compareTo("2.6.22") != 0) {
|
---|
768 | // System.err.println("\t" + nodeID);
|
---|
769 | getClassifierNodes(nodeID);
|
---|
770 | }
|
---|
771 | }
|
---|
772 | }
|
---|
773 | if (documentList.getLength() > 0)
|
---|
774 | System.out.println(getClassifierNodeName(rootNode) + "#"
|
---|
775 | + documentList.getLength());
|
---|
776 | }
|
---|
777 | }
|
---|