/**
* $Revision: $
* $Date: $
*
* Copyright (C) 2008 Jive Software. All rights reserved.
*
* This software is published under the terms of the GNU Public License (GPL),
* a copy of which is included in this distribution, or a commercial license
* agreement with Jive.
*/
package org.jivesoftware.openfire.nio;
import org.apache.mina.common.ByteBuffer;
import org.jivesoftware.util.Log;
import java.io.UnsupportedEncodingException;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.List;
/**
 * Lightweight, incremental XML splitter.
 *
 * Bytes received from a channel are handed to {@link #read(ByteBuffer)}, decoded
 * to characters and appended to an internal buffer. A small state machine walks
 * the newly appended characters and cuts the stream into complete top-level
 * stanzas. Finished stanzas are queued; use {@link #areThereMsgs()} to test for
 * them and {@link #getMsgs()} to fetch and clear them.
 *
 * No thread-safety is provided by this class itself.
 *
 * @author Daniele Piras
 * @author Gaston Dombiak
 */
class XMLLightweightParser {
    // Maximum number of queued chars tolerated before parsing is aborted (1 MB).
    private static final int MAX_BUFFER_SIZE = 1048576;

    // Chars that represent a CDATA section start
    protected static char[] CDATA_START = {'<', '!', '[', 'C', 'D', 'A', 'T', 'A', '['};
    // Chars that represent a CDATA section end
    protected static char[] CDATA_END = {']', ']', '>'};

    // Buffer with all decoded data that has not been discarded yet. It grows on
    // demand; read() rejects anything beyond MAX_BUFFER_SIZE.
    protected StringBuilder buffer = new StringBuilder();
    // Not used by this class. Kept (empty) only so that the field stays visible
    // to existing subclasses; it used to pre-allocate 1.2M slots per parser.
    protected String[] copybuffer = new String[0];

    // ---- INTERNAL STATUS -------
    // Initial status
    protected static final int INIT = 0;
    // Status used when the first tag name is retrieved
    protected static final int HEAD = 2;
    // Status used when robot is inside the xml and it is looking for the tag conclusion
    protected static final int INSIDE = 3;
    // Status used when a '<' is found and the robot tries to find the conclusion tag.
    protected static final int PRETAIL = 4;
    // Status used when the ending tag is equal to the head tag
    protected static final int TAIL = 5;
    // Status used when robot is inside the main tag and found a '/' to check '/>'.
    protected static final int VERIFY_CLOSE_TAG = 6;
    // Status used when you are inside a parameter
    protected static final int INSIDE_PARAM_VALUE = 7;
    // Status used when you are inside a cdata section
    protected static final int INSIDE_CDATA = 8;
    // Status used when you are outside a tag/reading text
    protected static final int OUTSIDE = 9;

    // Human readable status names, indexed by status value (debugging aid).
    final String[] sstatus = {"INIT", "", "HEAD", "INSIDE", "PRETAIL", "TAIL", "VERIFY", "INSIDE_PARAM", "INSIDE_CDATA", "OUTSIDE"};

    // Current robot status
    protected int status = XMLLightweightParser.INIT;
    // Index used while looking for a CDATA section start or end.
    protected int cdataOffset = 0;
    // Number of chars that match the head tag. If tailCount equals the head
    // length then the closing tag has been found.
    protected int tailCount = 0;
    // Starting point in the buffer of the next (not yet complete) message.
    protected int startLastMsg = 0;
    // Index in the buffer of the first char the robot has not examined yet.
    protected int currentPosition = 0;
    // Flag used to discover tags in the self-closing form (name followed by "/>").
    protected boolean insideRootTag = false;
    // Object containing the head tag (the name followed by '>')
    protected StringBuilder head = new StringBuilder(5);
    // List with all finished messages found.
    protected List<String> msgs = new ArrayList<String>();
    // Nesting level of the element currently being read.
    private int depth = 0;
    protected boolean insideChildrenTag = false;

    // Charset the incoming bytes are expected to be encoded in.
    Charset encoder;
    // Stateful decoder for that charset; it is fed chunk by chunk.
    CharsetDecoder decoder;

    /**
     * Creates a parser that decodes incoming bytes with the given charset.
     *
     * @param charset name of the charset, as accepted by {@link Charset#forName(String)}.
     */
    public XMLLightweightParser(String charset) {
        encoder = Charset.forName(charset);
        decoder = encoder.newDecoder();
    }

    /**
     * Creates a parser that decodes incoming bytes as UTF-8.
     *
     * @throws Exception never thrown by this implementation; the clause is kept
     *         so that existing callers keep compiling.
     */
    public XMLLightweightParser() throws Exception {
        this("UTF-8");
    }

    /**
     * Manual smoke test: feeds two stanzas containing 3-byte UTF-8 characters to
     * a parser in 10-byte chunks, so that characters are regularly split across
     * chunks, and prints every stanza that is found.
     */
    public static void main(String[] args) throws Exception {
        XMLLightweightParser parser = new XMLLightweightParser();

        StringBuilder text = new StringBuilder();
        for (int m = 0; m < 2; m++) {
            text.append("<message><body>");
            for (int e = 0; e < 31; e++) {
                text.append('\u20ac');
            }
            text.append("jhj</body></message> ");
        }
        byte[] data = text.toString().getBytes("UTF-8");

        int chunkSize = 10;
        // +3: room for the bytes of an incomplete char left over by the previous read
        ByteBuffer io = ByteBuffer.allocate(chunkSize + 3);
        int position = 0;
        while (position < data.length) {
            System.out.println("Postion = " + position);
            int count = Math.min(chunkSize, Math.min(io.remaining(), data.length - position));
            for (int k = 0; k < count; k++) {
                io.put(data[position++]);
            }
            io.flip();
            parser.read(io);
            for (String stanza : parser.getMsgs()) {
                System.out.println("M = " + stanza);
            }
        }
    }

    /**
     * @return true if the parser has found at least one complete xml message.
     */
    public boolean areThereMsgs() {
        return !msgs.isEmpty();
    }

    /**
     * Returns the complete messages found so far and clears the internal queue.
     *
     * @return an array with all messages found; empty if there are none.
     */
    public String[] getMsgs() {
        String[] res = msgs.toArray(new String[msgs.size()]);
        msgs.clear();
        return res;
    }

    /**
     * Adds a message to the list of finished messages and resets the robot so
     * that it is ready for the next message.
     *
     * @param msg the complete message; null only resets the robot.
     */
    protected void foundMsg(String msg) {
        // Add message to the complete message list
        if (msg != null) {
            msgs.add(msg);
        }
        // Reinit the robot
        status = XMLLightweightParser.INIT;
        tailCount = 0;
        cdataOffset = 0;
        head.setLength(0);
        insideRootTag = false;
        insideChildrenTag = false;
        depth = 0;
    }

    /**
     * Main reading method. Decodes every complete character available in the
     * given buffer, appends the text to the internal buffer and scans it for
     * complete messages.
     *
     * The buffer must be in read mode (flipped). On return it has been
     * compacted: the bytes of a character that is still incomplete (if any) sit
     * at its start and new bytes can be appended right after them.
     * NOTE(review): the owner of the buffer has to keep those leftover bytes and
     * present them again together with the next chunk - confirm against callers.
     *
     * @param byteBuffer buffer holding the bytes that were received.
     * @throws Exception if the bytes are not valid in the configured charset or
     *         if more than 1 MB of chars is queued without a complete message.
     */
    public void read(ByteBuffer byteBuffer) throws Exception {
        if (!byteBuffer.hasRemaining()) {
            // Nothing was received; still hand the buffer back ready to be filled.
            byteBuffer.compact();
            return;
        }
        discardConsumedChars();
        decodeAvailableBytes(byteBuffer);
        // Keep only the bytes of a trailing incomplete char, ready for more input
        byteBuffer.compact();

        // Check that the buffer is not bigger than 1 Megabyte. For security reasons
        // we will abort parsing when 1 Mega of queued chars was found.
        if (buffer.length() > MAX_BUFFER_SIZE) {
            throw new Exception("Stopped parsing never ending stanza");
        }
        // Only part of a multi-byte char has arrived so far
        if (buffer.length() == 0) {
            return;
        }
        // Empty the buffer if it only contains white spaces
        if ((buffer.charAt(0) <= ' ') && (buffer.charAt(buffer.length() - 1) <= ' ')
                && ("".equals(buffer.toString().trim()))) {
            buffer.setLength(0);
            startLastMsg = 0;
            currentPosition = 0;
            return;
        }

        scanNewChars();

        if (head.length() > 0 &&
                ("/stream:stream>".equals(head.toString()) || ("/flash:stream>".equals(head.toString())))) {
            // Found closing stream:stream. Report it and drop its text so that it
            // is not glued in front of whatever might follow.
            foundMsg("</stream:stream>");
            startLastMsg = buffer.length();
        }
    }

    /**
     * Removes from the internal buffer the chars that belong to messages that
     * were already delivered (or that were skipped before a message start).
     */
    private void discardConsumedChars() {
        int consumed = Math.min(startLastMsg, buffer.length());
        if (consumed > 0) {
            buffer.delete(0, consumed);
            currentPosition = Math.max(0, currentPosition - consumed);
        }
        startLastMsg = 0;
    }

    /**
     * Decodes as many complete characters as possible and appends them to the
     * internal buffer. The bytes of an incomplete trailing character are left
     * unread in the given buffer.
     */
    private void decodeAvailableBytes(ByteBuffer byteBuffer) throws Exception {
        // The underlying NIO buffer shares position and limit with the MINA wrapper
        java.nio.ByteBuffer in = byteBuffer.buf();
        int estimate = (int) Math.ceil(in.remaining() * (double) decoder.maxCharsPerByte());
        CharBuffer out = CharBuffer.allocate(Math.max(estimate, 16));
        while (true) {
            // endOfInput=false: an incomplete trailing sequence is not an error,
            // it simply stays in the input until the rest arrives
            java.nio.charset.CoderResult result = decoder.decode(in, out, false);
            if (result.isError()) {
                throw new Exception("Invalid UTF-8 sequence detected");
            }
            buffer.append(out.array(), 0, out.position());
            out.clear();
            if (result.isUnderflow()) {
                return;
            }
            // Overflow: the output was full, go around again
        }
    }

    /**
     * Stores the chars from the start of the current message up to (excluding)
     * the given index as a finished message and moves the message start there.
     */
    private void completeMessage(int end) {
        String msg = buffer.substring(startLastMsg, end);
        foundMsg(msg);
        startLastMsg = end;
    }

    /**
     * Robot. Walks the chars that were appended since the last scan and updates
     * the state machine, queueing every message that gets completed.
     */
    private void scanNewChars() {
        for (int i = currentPosition; i < buffer.length(); i++) {
            char ch = buffer.charAt(i);
            switch (status) {
                case TAIL:
                    // Looking for the close tag
                    if (depth < 1 && ch == head.charAt(tailCount)) {
                        tailCount++;
                        if (tailCount == head.length()) {
                            // Close stanza found!
                            completeMessage(i + 1);
                        }
                    } else {
                        tailCount = 0;
                        status = INSIDE;
                    }
                    break;

                case PRETAIL:
                    if (ch == CDATA_START[cdataOffset]) {
                        cdataOffset++;
                        if (cdataOffset == CDATA_START.length) {
                            status = INSIDE_CDATA;
                            cdataOffset = 0;
                            continue;
                        }
                    } else {
                        cdataOffset = 0;
                        status = INSIDE;
                    }
                    if (ch == '/') {
                        // A closing tag starts here
                        status = TAIL;
                        depth--;
                    } else if (ch == '!') {
                        // Comment or CDATA marker: not an element, depth is unchanged
                        status = INSIDE;
                    } else {
                        // A child element starts here
                        depth++;
                    }
                    break;

                case VERIFY_CLOSE_TAG:
                    if (ch == '>') {
                        depth--;
                        status = OUTSIDE;
                        if (depth < 1) {
                            // Found a self-closing top level tag
                            completeMessage(i + 1);
                        }
                    } else if (ch == '<') {
                        status = PRETAIL;
                        insideChildrenTag = true;
                    } else {
                        status = INSIDE;
                    }
                    break;

                case INSIDE_PARAM_VALUE:
                    if (ch == '"') {
                        status = INSIDE;
                    }
                    break;

                case INSIDE_CDATA:
                    if (ch == CDATA_END[cdataOffset]) {
                        cdataOffset++;
                        if (cdataOffset == CDATA_END.length) {
                            status = OUTSIDE;
                            cdataOffset = 0;
                        }
                    } else {
                        cdataOffset = 0;
                    }
                    break;

                case INSIDE:
                    if (ch == CDATA_START[cdataOffset]) {
                        cdataOffset++;
                        if (cdataOffset == CDATA_START.length) {
                            status = INSIDE_CDATA;
                            cdataOffset = 0;
                            continue;
                        }
                    } else {
                        cdataOffset = 0;
                        status = INSIDE;
                    }
                    if (ch == '"') {
                        status = INSIDE_PARAM_VALUE;
                    } else if (ch == '>') {
                        status = OUTSIDE;
                        if (insideRootTag && ("stream:stream>".equals(head.toString()) ||
                                ("?xml>".equals(head.toString())) || ("flash:stream>".equals(head.toString())))) {
                            // Found an xml declaration or an opening stream header:
                            // it is delivered on its own, it is never closed here
                            int end = i + 1;
                            // Skip LF, CR and other "weird" characters that could appear
                            while (startLastMsg < end && '<' != buffer.charAt(startLastMsg)) {
                                startLastMsg++;
                            }
                            completeMessage(end);
                        }
                        insideRootTag = false;
                    } else if (ch == '/') {
                        status = VERIFY_CLOSE_TAG;
                    }
                    break;

                case HEAD:
                    if (ch == ' ' || ch == '>') {
                        // Append > to head to allow searching
                        head.append(">");
                        status = (ch == '>') ? OUTSIDE : INSIDE;
                        insideRootTag = true;
                        insideChildrenTag = false;
                        continue;
                    } else if (ch == '/' && head.length() > 0) {
                        status = VERIFY_CLOSE_TAG;
                        depth--;
                    }
                    head.append(ch);
                    break;

                case INIT:
                    if (ch == '<') {
                        status = HEAD;
                        depth = 1;
                    } else {
                        // Ignore whatever precedes the start of a message
                        startLastMsg++;
                    }
                    break;

                case OUTSIDE:
                    if (ch == '<') {
                        status = PRETAIL;
                        // The '<' already matches the first char of CDATA_START
                        cdataOffset = 1;
                        insideChildrenTag = true;
                    }
                    break;
            }
        }
        currentPosition = buffer.length();
    }
}