" test 2: r#""# Result: Item { item_type: TEXT, ascii_data:" /> " test 2: r#""# Result: Item { item_type: TEXT, ascii_data:" /> " test 2: r#""# Result: Item { item_type: TEXT, ascii_data:"/>

Parse this custom data format for data type with nested list

41 Views Asked by At

I have this format, the result should be of Item data type,

test 1: "<A \"Test\">"

test 2: r#"<A "Test">"#

Result: Item { item_type: TEXT, ascii_data: Some("Test") }

test 3: <A>

Result: Item { item_type: TEXT, ascii_data: None }

For test 1 and test 2 the following code parses the input correctly, but for test 3 it fails.

It also contains nested types.

<L 
    <A "Test1">
    <L
        <A>
        <A "Test2">
    >
    <A "Test3">
>

Result:

Item { 
    item_type: LIST,
    sub_items: [
        Item {
            item_type: ASCII,
            ascii_data: "Test1",
        },
        Item {
            item_type: LIST,
            sub_items: [
                Item {
                    item_type: ASCII,
                    ascii_data: None,
                    }
                Item {
                    item_type: ASCII,
                    ascii_data: "Test2",
                    }
                ], 
        },
        Item {
            item_type: ASCII,
            ascii_data: "Test3",
        },
    ],
}

Rust Playground link

use nom::{
    branch::alt,
    bytes::complete::{tag, take_until},
    character::complete::multispace0,
    combinator::map,
    IResult,
};

/// Kind of node an [`Item`] represents.
#[derive(Clone, Debug, PartialEq)]
enum ItemType {
    /// A list node; this is the kind `parse_list_item` assigns.
    LIST,
    /// A text node; this is the kind `parse_ascii_item` assigns.
    TEXT,
    /// Placeholder kind used by `Item::default()`.
    NONE,
}

/// A parsed node of the custom format: a text item or a list of nested items.
#[derive(Clone, Debug, PartialEq)]
struct Item {
    /// Which kind of node this is.
    item_type: ItemType,
    /// Child items of a list node; `None` by default.
    sub_items: Option<Vec<Item>>,
    /// Text payload of an `<A ...>` item; `None` by default.
    ascii_data: Option<String>,
}

/// The default item is an untyped placeholder with no children and no text.
impl Default for Item {
    /// Builds an [`Item`] tagged [`ItemType::NONE`] whose optional fields are both `None`.
    fn default() -> Self {
        let item_type = ItemType::NONE;

        Self {
            ascii_data: None,
            sub_items: None,
            item_type,
        }
    }
}

/// Parses a double-quoted string and returns its contents without the quotes.
///
/// The input must begin with `"`. Everything up to the next `"` is returned as an
/// owned `String`, and the closing quote is consumed. No escape sequences are
/// interpreted. Returns an error if the opening or closing quote is missing.
fn parse_ascii_data(input: &str) -> IResult<&str, String> {
    const QUOTE: &str = "\"";

    let (rest, _) = tag(QUOTE)(input)?;
    let (rest, contents) = take_until(QUOTE)(rest)?;
    let (rest, _) = tag(QUOTE)(rest)?;

    Ok((rest, String::from(contents)))
}

/// Parses a text item of the form `<A "string">` into an [`Item`] of kind `TEXT`.
///
/// NOTE(review): the intent is to also accept a bare `<A>`, but as written a quoted
/// string is mandatory. Both `alt` branches begin by matching a `"`, so for `<A>` the
/// `alt` call fails and `?` returns that error before `tag(">")` is ever tried.
/// On success `ascii_data` is therefore always `Some(..)`, never `None`.
fn parse_ascii_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = tag("<A")(input)?;
    // Optional whitespace between the tag name and the quoted string.
    let (input, _) = multispace0(input)?;
    // First try a quoted string; otherwise accept a literal `""` and map it to an empty string.
    let (input, ascii_data) = alt((parse_ascii_data, map(tag("\"\""), |_| "".to_string())))(input)?;
    // No whitespace is skipped here, so `>` must directly follow the closing quote.
    let (input, _) = tag(">")(input)?;

    Ok((
        input,
        Item {
            item_type: ItemType::TEXT,
            ascii_data: Some(ascii_data),
            ..Default::default()
        },
    ))
}

/// Parses a list item that contains exactly one child, e.g. `<L <A "string">>`.
///
/// NOTE(review): as written this does not handle an empty `<L>`, more than one child,
/// or a nested `<L ...>` child. After `<L` and optional whitespace it requires either
/// one `parse_ascii_item` match or the literal `<>`, immediately followed by `>`.
fn parse_list_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = tag("<L")(input)?;
    // Optional whitespace between the tag name and the child.
    let (input, _) = multispace0(input)?;
    // A single child: a text item, or a literal `<>` mapped to the default placeholder item.
    let (input, sub_items) = alt((parse_ascii_item, map(tag("<>"), |_| Item::default())))(input)?;
    // No whitespace is skipped here, so `>` must directly follow the child.
    let (input, _) = tag(">")(input)?;

    Ok((
        input,
        Item {
            item_type: ItemType::LIST,
            // The one parsed child is wrapped in a single-element vector.
            sub_items: Some(vec![sub_items]),
            ..Default::default()
        },
    ))
}

// Unit tests describing the desired behavior of the two item parsers.
#[cfg(test)]
mod tests {
    use super::*;

    // Text items: quoted payload (escaped and raw-string spellings) and a bare `<A>`.
    #[test]
    fn test_parse_ascii_item() {
        // Same input as the raw-string case below, written with escaped quotes.
        let input = "<A \"Test\">";

        let expected_item = Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test".to_string()),
            ..Default::default()
        };

        assert_eq!(parse_ascii_item(input), Ok(("", expected_item)));

        let input = r#"<A "Test">"#;

        let expected_item = Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test".to_string()),
            ..Default::default()
        };

        assert_eq!(parse_ascii_item(input), Ok(("", expected_item)));

        // A bare `<A>` is expected to yield a TEXT item with no payload.
        assert_eq!(
            parse_ascii_item("<A>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::TEXT,
                    ascii_data: None,
                    ..Default::default()
                }
            ))
        );
    }

    // List items: a list with one text child, and an empty list.
    #[test]
    fn test_parse_list_item() {
        let input = "<L <A \"Test\">>";

        let expected_item = Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test".to_string()),
                ..Default::default()
            }]),
            ..Default::default()
        };

        assert_eq!(parse_list_item(input), Ok(("", expected_item)));

        // An empty `<L>` is expected to hold a single default placeholder item.
        assert_eq!(
            parse_list_item("<L>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item::default()]),
                    ..Default::default()
                }
            ))
        );
    }

    // Nested lists spread over several lines.
    #[test]
    fn test_parse_nested_list_item() {
        // Outer list containing the text item "Test1" and an inner list with "Test2".
        let input = "<L \n    <A \"Test1\">\n    <L\n        <A \"Test2\">\n    >\n>";

        // NOTE(review): this expected value lists a single TEXT child "Test" and does not
        // correspond to the nested input above ("Test1" plus an inner list with "Test2").
        let expected_item = Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test".to_string()),
                ..Default::default()
            }]),
            ..Default::default()
        };

        assert_eq!(parse_list_item(input), Ok(("", expected_item)));

        // Same empty-list assertion as in `test_parse_list_item`.
        assert_eq!(
            parse_list_item("<L>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item::default()]),
                    ..Default::default()
                }
            ))
        );
    }
}
1

There is 1 best solution below

5
vallentin On BEST ANSWER

In parse_ascii_item() you don't want to use alt(); you want to use opt(). In parse_list_item(), on the other hand, you actually do want to use alt().

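The difference is that alt() tries the given parsers one by one until one succeeds, and fails if none of them does, whereas opt() makes a single parser optional. In parse_ascii_item() you want to accept optional ascii_data, which is exactly what opt() is for.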

The fixed parse_ascii_item() looks like this:

/// Parses a text item, either `<A>` or `<A "string">`, into an [`Item`] of kind `TEXT`.
///
/// Whitespace is skipped before `<A`, before the optional quoted string, and before the
/// closing `>`. `ascii_data` is `Some(text)` when a quoted string is present and `None`
/// when it is omitted.
fn parse_ascii_item(input: &str) -> IResult<&str, Item> {
    let (rest, _) = multispace0(input)?;
    let (rest, _) = tag("<A")(rest)?;
    let (rest, _) = multispace0(rest)?;
    // `opt` yields `None` instead of an error when no quoted string follows.
    let (rest, text) = opt(parse_ascii_data)(rest)?;
    let (rest, _) = multispace0(rest)?;
    let (rest, _) = tag(">")(rest)?;

    let item = Item {
        item_type: ItemType::TEXT,
        ascii_data: text,
        ..Default::default()
    };
    Ok((rest, item))
}

Now in parse_list_item() we do actually want to use alt(), since we want to accept either parse_ascii_item() or parse_list_item(). Additionally, since we want to accept zero-to-many of them, we also need to wrap it in many0().

The fixed parse_list_item() looks like this:

/// Parses a list item `<L ...>` whose children are any number of text items and nested lists.
///
/// Whitespace is skipped before `<L`, before each child, and before the closing `>`.
/// An empty list is represented by a single `Item::default()` placeholder child, so
/// `sub_items` is always `Some` of a non-empty vector.
fn parse_list_item(input: &str) -> IResult<&str, Item> {
    let (rest, _) = multispace0(input)?;
    let (rest, _) = tag("<L")(rest)?;
    // Zero or more children, each one a text item or (recursively) another list.
    let (rest, children) = many0(|inner| {
        let (inner, _) = multispace0(inner)?;
        alt((parse_ascii_item, parse_list_item))(inner)
    })(rest)?;
    let (rest, _) = multispace0(rest)?;
    let (rest, _) = tag(">")(rest)?;

    // An empty list still carries one placeholder child.
    let sub_items = if children.is_empty() {
        vec![Item::default()]
    } else {
        children
    };

    let list = Item {
        item_type: ItemType::LIST,
        sub_items: Some(sub_items),
        ..Default::default()
    };
    Ok((rest, list))
}

Additionally, your test_parse_nested_list_item() is wrong. At least your input doesn't reasonably match the expected_item. So I assume expected_item actually needs to look like this:

// Expected tree for the nested input: an outer list holding the text item "Test1"
// followed by an inner list that holds the text item "Test2".
let expected_item = Item {
    item_type: ItemType::LIST,
    sub_items: Some(vec![
        // First child: text item "Test1".
        Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test1".to_string()),
            ..Default::default()
        },
        // Second child: nested list with a single text item "Test2".
        Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test2".to_string()),
                ..Default::default()
            }]),
            ..Default::default()
        },
    ]),
    ..Default::default()
};

Here's a complete example on Rust Playground.


As an aside, in the future remember to sprinkle multispace0() around, since you allow for optional whitespace in various places:

// Skip any whitespace at the current position (there may be none) and continue with the rest.
let (input, _) = multispace0(input)?;

Another aside: Rust's string literals allow for newlines. Additionally, instead of escaping \" you can also use raw string literals:

// Before: newlines and quotes are written as `\n` and `\"` escapes.
let input = "<L \n    <A \"Test1\">\n    <L\n        <A \"Test2\">\n    >\n>";

// After: a raw string literal spanning several lines, so no escapes are needed.
// Note that this literal starts with a newline and is indented one level deeper than
// the escaped version, so the whitespace in the two strings is not identical.
let input = r#"
    <L
        <A "Test1">
        <L
            <A "Test2">
        >
    >"#;