2021-06-08 00:41:15 -04:00
|
|
|
#![allow(clippy::upper_case_acronyms)]
|
|
|
|
|
2021-04-15 00:49:52 -04:00
|
|
|
use anyhow::{bail, Result};
|
|
|
|
|
2021-04-16 01:16:19 -04:00
|
|
|
use crate::stanzafilter::StanzaState::*;
|
2021-04-15 00:49:52 -04:00
|
|
|
use crate::to_str;
|
|
|
|
|
2021-04-16 01:16:19 -04:00
|
|
|
/// Parser position of the byte-at-a-time stanza scanner.
///
/// The scanner only tracks enough structure to find where one top-level
/// stanza ends; it does not validate XML.
#[derive(Debug)]
enum StanzaState {
    /// Not inside any stanza; every byte except `<` is ignored.
    OutsideStanza,
    /// The `<` that opens a new stanza was just consumed.
    StanzaFirstChar,
    /// A `<` was just consumed while already inside a stanza.
    InsideTagFirstChar,
    /// Inside a tag, but not inside a quoted attribute value.
    InsideTag,
    /// Inside a quoted attribute value; the payload is the quote byte
    /// (`'` or `"`) that terminates it.
    InsideAttribute(u8),
    /// Inside a stanza, after a tag was closed and before the next `<`.
    BetweenTags,
    /// After `<!` inside a stanza; the payload is the buffer index at which
    /// the bytes `[CDATA[` must have been read in full.
    ExclamationTag(usize),
    /// Inside a CDATA section, waiting for the terminating `]]>`.
    InsideCDATA,
    /// After `<?` at the start of a stanza; the payload is the buffer index
    /// at which the bytes `xml ` must have been read in full.
    QuestionTag(usize),
    /// Inside a `<?xml ` declaration, waiting for the closing `>`.
    InsideXmlTag,
    /// After `</` at the start of a stanza; only `</stream:stream>` is
    /// accepted from here.
    EndStream,
}
|
|
|
|
|
2021-04-15 00:49:52 -04:00
|
|
|
pub struct StanzaFilter {
|
|
|
|
buf_size: usize,
|
|
|
|
pub buf: Vec<u8>,
|
2021-07-28 02:24:08 -04:00
|
|
|
end_of_first_tag: usize,
|
2021-04-15 00:49:52 -04:00
|
|
|
cnt: usize,
|
|
|
|
tag_cnt: usize,
|
2021-04-16 01:16:19 -04:00
|
|
|
state: StanzaState,
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
|
|
|
|
2021-05-15 00:23:04 -04:00
|
|
|
#[inline(always)]
|
|
|
|
fn checked_sub(i: usize, s: usize) -> Result<usize> {
|
|
|
|
// i.checked_sub(s).ok_or_else(||anyhow::anyhow!("invalid stanza"))
|
|
|
|
if s > i {
|
|
|
|
bail!("invalid stanza")
|
|
|
|
} else {
|
|
|
|
Ok(i - s)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-15 00:49:52 -04:00
|
|
|
impl StanzaFilter {
|
|
|
|
pub fn new(buf_size: usize) -> StanzaFilter {
|
|
|
|
StanzaFilter {
|
|
|
|
buf_size,
|
|
|
|
buf: vec![0u8; buf_size],
|
2021-07-28 02:24:08 -04:00
|
|
|
end_of_first_tag: 0,
|
2021-04-15 00:49:52 -04:00
|
|
|
cnt: 0,
|
|
|
|
tag_cnt: 0,
|
2021-04-16 01:16:19 -04:00
|
|
|
state: OutsideStanza,
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[inline(always)]
|
|
|
|
pub fn current_buf(&mut self) -> &mut [u8] {
|
|
|
|
&mut self.buf[self.cnt..(self.cnt + 1)]
|
|
|
|
}
|
|
|
|
|
|
|
|
#[allow(dead_code)]
|
|
|
|
pub fn process_next_byte(&mut self) -> Result<Option<&[u8]>> {
|
|
|
|
if let Some(idx) = self.process_next_byte_idx()? {
|
|
|
|
return Ok(Some(&self.buf[0..idx]));
|
|
|
|
}
|
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn process_next_byte_idx(&mut self) -> Result<Option<usize>> {
|
|
|
|
let b = self.buf[self.cnt];
|
2021-04-16 01:16:19 -04:00
|
|
|
//print!("b: '{}', cnt: {}, tag_cnt: {}, state: {:?}; ", b as char, self.cnt, self.tag_cnt, self.state);
|
|
|
|
match self.state {
|
|
|
|
OutsideStanza => {
|
|
|
|
if b == b'<' {
|
|
|
|
self.tag_cnt += 1;
|
|
|
|
self.state = StanzaFirstChar;
|
|
|
|
} else {
|
|
|
|
// outside of stanzas, let's ignore all characters except <
|
|
|
|
// prosody does this, and since things do whitespace pings, it's good
|
|
|
|
return Ok(None);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BetweenTags => {
|
|
|
|
if b == b'<' {
|
|
|
|
self.tag_cnt += 1;
|
|
|
|
self.state = InsideTagFirstChar;
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
2021-04-16 01:16:19 -04:00
|
|
|
}
|
|
|
|
StanzaFirstChar => match b {
|
|
|
|
b'/' => self.state = EndStream,
|
2021-04-16 01:57:02 -04:00
|
|
|
b'!' | b'>' | b'\'' | b'"' => bail!("illegal stanza: {}", to_str(&self.buf[..(self.cnt + 1)])),
|
2021-04-16 01:16:19 -04:00
|
|
|
b'?' => self.state = QuestionTag(self.cnt + 4), // 4 is length of b"xml "
|
|
|
|
_ => self.state = InsideTag,
|
|
|
|
},
|
|
|
|
InsideTagFirstChar => match b {
|
2021-05-15 00:23:04 -04:00
|
|
|
b'/' => self.tag_cnt = checked_sub(self.tag_cnt, 2)?,
|
2021-04-16 01:16:19 -04:00
|
|
|
b'!' => self.state = ExclamationTag(self.cnt + 7), // 7 is length of b"[CDATA["
|
2021-04-16 01:57:02 -04:00
|
|
|
b'?' | b'>' | b'\'' | b'"' => bail!("illegal stanza: {}", to_str(&self.buf[..(self.cnt + 1)])),
|
2021-04-16 01:16:19 -04:00
|
|
|
_ => self.state = InsideTag,
|
|
|
|
},
|
2021-04-16 01:57:02 -04:00
|
|
|
InsideTag => match b {
|
|
|
|
b'>' => {
|
2021-07-28 02:24:08 -04:00
|
|
|
if self.end_of_first_tag == 0 {
|
|
|
|
self.end_of_first_tag = self.cnt;
|
|
|
|
}
|
2021-04-16 01:16:19 -04:00
|
|
|
if self.buf[self.cnt - 1] == b'/' {
|
|
|
|
// state can't be InsideTag unless we are on at least the second character, so can't go out of range
|
2021-04-15 00:49:52 -04:00
|
|
|
// self-closing tag
|
2021-05-15 00:23:04 -04:00
|
|
|
self.tag_cnt = checked_sub(self.tag_cnt, 1)?;
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
|
|
|
if self.tag_cnt == 0 {
|
2021-04-16 01:16:19 -04:00
|
|
|
return self.stanza_end();
|
|
|
|
}
|
|
|
|
// now special case <stream:stream ...> which we want to send stand-alone:
|
|
|
|
if self.tag_cnt == 1 && self.buf.len() >= 15 && b"<stream:stream " == &self.buf[0..15] {
|
|
|
|
return self.stanza_end();
|
|
|
|
}
|
|
|
|
self.state = BetweenTags;
|
|
|
|
}
|
2021-04-16 01:57:02 -04:00
|
|
|
b'\'' | b'"' => self.state = InsideAttribute(b),
|
|
|
|
_ => {}
|
|
|
|
},
|
|
|
|
InsideAttribute(end) => {
|
|
|
|
if b == end {
|
|
|
|
self.state = InsideTag;
|
|
|
|
}
|
2021-04-16 01:16:19 -04:00
|
|
|
}
|
|
|
|
QuestionTag(idx) => {
|
|
|
|
if idx == self.cnt {
|
|
|
|
if self.last_equals(b"xml ")? {
|
|
|
|
self.state = InsideXmlTag;
|
|
|
|
} else {
|
|
|
|
bail!("illegal stanza: {}", to_str(&self.buf[..(self.cnt + 1)]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
InsideXmlTag => {
|
|
|
|
if b == b'>' {
|
|
|
|
return self.stanza_end();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ExclamationTag(idx) => {
|
|
|
|
if idx == self.cnt {
|
|
|
|
if self.last_equals(b"[CDATA[")? {
|
|
|
|
self.state = InsideCDATA;
|
2021-05-15 00:23:04 -04:00
|
|
|
self.tag_cnt = checked_sub(self.tag_cnt, 1)?; // cdata not a tag
|
2021-04-16 01:16:19 -04:00
|
|
|
} else {
|
|
|
|
bail!("illegal stanza: {}", to_str(&self.buf[..(self.cnt + 1)]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
InsideCDATA => {
|
|
|
|
if b == b'>' && self.last_equals(b"]]>")? {
|
|
|
|
self.state = BetweenTags;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EndStream => {
|
|
|
|
if b == b'>' {
|
|
|
|
if self.last_equals(b"</stream:stream>")? {
|
2021-07-28 02:24:08 -04:00
|
|
|
if self.end_of_first_tag == 0 {
|
|
|
|
self.end_of_first_tag = self.cnt;
|
|
|
|
}
|
2021-04-16 01:16:19 -04:00
|
|
|
return self.stanza_end();
|
|
|
|
} else {
|
|
|
|
bail!("illegal stanza: {}", to_str(&self.buf[..(self.cnt + 1)]));
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-06-08 00:14:22 -04:00
|
|
|
//trace!("cnt: {}, tag_cnt: {}, state: {:?}", self.cnt, self.tag_cnt, self.state);
|
2021-04-15 00:49:52 -04:00
|
|
|
self.cnt += 1;
|
|
|
|
if self.cnt == self.buf_size {
|
|
|
|
bail!("stanza too big: {}", to_str(&self.buf));
|
|
|
|
}
|
|
|
|
Ok(None)
|
|
|
|
}
|
2021-04-16 01:16:19 -04:00
|
|
|
|
|
|
|
fn stanza_end(&mut self) -> Result<Option<usize>> {
|
|
|
|
let ret = Ok(Some(self.cnt + 1));
|
|
|
|
self.tag_cnt = 0;
|
|
|
|
self.cnt = 0;
|
|
|
|
self.state = OutsideStanza;
|
2021-06-08 00:14:22 -04:00
|
|
|
//trace!("cnt: {}, tag_cnt: {}, state: {:?}", self.cnt, self.tag_cnt, self.state);
|
2021-06-08 00:41:15 -04:00
|
|
|
ret
|
2021-04-16 01:16:19 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
fn last_equals(&self, needle: &[u8]) -> Result<bool> {
|
|
|
|
Ok(needle == self.last_num_bytes(needle.len())?)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn last_num_bytes(&self, num: usize) -> Result<&[u8]> {
|
|
|
|
let num = num - 1;
|
|
|
|
if num <= self.cnt {
|
|
|
|
Ok(&self.buf[(self.cnt - num)..(self.cnt + 1)])
|
|
|
|
} else {
|
|
|
|
bail!("expected {} bytes only have {} bytes", num, (self.cnt + 1))
|
|
|
|
}
|
|
|
|
}
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
// this would be better as an async trait, but that doesn't work yet...
|
|
|
|
/// Newtype around an input source `T`, exposed as the public field `0`.
///
/// The stanza-reading methods are available when `T` implements
/// `tokio::io::AsyncRead + Unpin`.
pub struct StanzaReader<T>(pub T);
|
|
|
|
|
|
|
|
impl<T: tokio::io::AsyncRead + Unpin> StanzaReader<T> {
|
|
|
|
pub async fn next<'a>(&'a mut self, filter: &'a mut StanzaFilter) -> Result<Option<&'a [u8]>> {
|
|
|
|
use tokio::io::AsyncReadExt;
|
|
|
|
|
|
|
|
loop {
|
|
|
|
let n = self.0.read(filter.current_buf()).await?;
|
|
|
|
if n == 0 {
|
|
|
|
return Ok(None);
|
|
|
|
}
|
|
|
|
if let Some(idx) = filter.process_next_byte_idx()? {
|
|
|
|
return Ok(Some(&filter.buf[0..idx]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-07-28 02:24:08 -04:00
|
|
|
|
|
|
|
pub async fn next_eoft<'a>(&'a mut self, filter: &'a mut StanzaFilter) -> Result<Option<(&'a [u8], usize)>> {
|
|
|
|
use tokio::io::AsyncReadExt;
|
|
|
|
|
|
|
|
loop {
|
|
|
|
let n = self.0.read(filter.current_buf()).await?;
|
|
|
|
if n == 0 {
|
|
|
|
return Ok(None);
|
|
|
|
}
|
|
|
|
if let Some(idx) = filter.process_next_byte_idx()? {
|
|
|
|
let end_of_first_tag = filter.end_of_first_tag;
|
|
|
|
filter.end_of_first_tag = 0;
|
|
|
|
return Ok(Some((&filter.buf[0..idx], end_of_first_tag)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-04-15 00:49:52 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::stanzafilter::*;
    use std::io::Cursor;

    impl<T: tokio::io::AsyncRead + Unpin> StanzaReader<T> {
        /// Test helper: reads stanzas until end of input and returns each
        /// one as an owned string.
        async fn into_vec(mut self, filter: &mut StanzaFilter) -> Result<Vec<String>> {
            let mut stanzas = Vec::new();
            loop {
                match self.next(filter).await? {
                    Some(bytes) => stanzas.push(to_str(bytes).to_string()),
                    None => return Ok(stanzas),
                }
            }
        }
    }

    /// Splits a stream containing an XML declaration, stream open/close
    /// tags, nested elements, CDATA sections and tricky attribute values
    /// into individual stanzas.
    #[tokio::test]
    async fn process_next_byte() -> Result<()> {
        let mut filter = StanzaFilter::new(262_144);

        // Bytes between stanzas (here: the line breaks) are skipped by the filter.
        let input = br###"
<?xml version='1.0'?>
<stream:stream xmlns='jabber:server' xmlns:stream='http://etherx.jabber.org/streams' xmlns:db='jabber:server:dialback' version='1.0' to='example.org' from='example.com' xml:lang='en'>
<a/><b>inside b before c<c>inside c</c></b></stream:stream>
<q>bla<![CDATA[<this>is</not><xml/>]]>bloo</q>
<x><![CDATA[ lol</x> ]]></x>
<z><x><![CDATA[ lol</x> ]]></x></z>
<a a='![CDATA['/>
<x a='/>'>This is going to be fun.</x>
<z><x a='/>'>This is going to be fun.</x></y>
<d></d><e><![CDATA[what]>]]]]></e></stream:stream>
"###;

        let expected = vec![
            "<?xml version='1.0'?>",
            "<stream:stream xmlns='jabber:server' xmlns:stream='http://etherx.jabber.org/streams' xmlns:db='jabber:server:dialback' version='1.0' to='example.org' from='example.com' xml:lang='en'>",
            "<a/>",
            "<b>inside b before c<c>inside c</c></b>",
            "</stream:stream>",
            "<q>bla<![CDATA[<this>is</not><xml/>]]>bloo</q>",
            "<x><![CDATA[ lol</x> ]]></x>",
            "<z><x><![CDATA[ lol</x> ]]></x></z>",
            "<a a='![CDATA['/>",
            "<x a='/>'>This is going to be fun.</x>",
            "<z><x a='/>'>This is going to be fun.</x></y>",
            "<d></d>",
            "<e><![CDATA[what]>]]]]></e>",
            "</stream:stream>",
        ];

        let actual = StanzaReader(Cursor::new(input)).into_vec(&mut filter).await?;
        assert_eq!(actual, expected);

        Ok(())
    }
}
|