Git Product home page Git Product logo

unhtml.rs's Introduction

A magic html parser.

Stable Test Rust Docs Crate version Download License: MIT

Table of Contents

Derive Target

struct

Basic Usage

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
#[html(selector = "#test")]
struct SingleUser {
    #[html(selector = "p:nth-child(1)", attr = "inner")]
    name: String,

    #[html(selector = "p:nth-child(2)", attr = "inner")]
    age: u8,

    #[html(selector = "p:nth-child(3)", attr = "inner")]
    like_lemon: bool,
}

let user = SingleUser::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
assert_eq!("Hexilee", &user.name);
assert_eq!(20, user.age);
assert!(user.like_lemon);

Attributes

html

target

derive target or field

specification

#[html(selector = "...", attr = "...", default = ...)]

selector, attr and default are all optional, and so is the html attribute itself.

This is valid

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};


#[derive(FromHtml)]
struct SingleString {
    value: String,
}

selector

target

derive target or field

literal type

string

specification

selector must be a valid CSS selector; an invalid selector will cause a compile-time panic.

// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
#[html(selector = "<>")]
struct SingleUser {}
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::*;

#[derive(FromHtml)]
struct SingleUser {
    #[html(selector = "<>", attr = "inner")]
    name: String,
}

If multiple elements are selected and the field type is not Vec, the first one will be chosen.

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
#[html(selector = "a")]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"
<a href="https://github.com">Github</a> 
<a href="https://google.com">Google</a> 
"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
default behavior

html of its root element

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);

attr

target

field

literal type

string

specification
  • inner refers to innerHtml
  • any other attr refers to the HTML element attribute of that name
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
default behavior

html of the whole element (not innerHtml!)

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
    
    source: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
assert_eq!(r#"<a href="https://github.com">Github</a>"#, &link.source);

default

target

field

literal type

any literal type

specification
  • the same type as the field
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct DefaultUser {
    // invoke String::from_html
    #[html(selector = "#non-exist", default = "Hexilee")]
    name: String,

    // invoke u8::from<u8>
    #[html(default = 20)]
    age: u8,

    #[html(default = true)]
    like_lemon: bool,
}

let user = DefaultUser::from_html("<p></p>").unwrap();
assert_eq!("Hexilee", &user.name);
assert_eq!(20, user.age);
assert_eq!(-1000, user.assets);
assert!(user.like_lemon);
  • string
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

#[derive(FromHtml)]
struct Website {
    #[html(default = "10")]
    age: u8,

    #[html(default = "<a href='https://github.com'>Github</a>")]
    link: Link,
}

let website = Website::from_html("<p></p>").unwrap();
let link = website.link;
assert_eq!(10u8, website.age);
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
default behavior

returns an Err(unhtml::failure::Error) when nothing is selected

// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    // no default
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a>Github</a>"#).unwrap();

Field Type

any type that implements FromHtml, without generics
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    // no default
    #[html(attr = "href")]
    href: &str,
}
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Website {
    // no default
    #[html(attr = "href")]
    hrefs: std::collections::LinkedList<String>,
}
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Website {
    // no default
    #[html(attr = "href")]
    hrefs: [String],
}
Vec

Should use unhtml::VecFromHtml

extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml, VecFromHtml};

#[derive(FromHtml)]
struct TestUser {
    #[html(selector = "p:nth-child(1)", attr = "inner")]
    name: String,

    #[html(selector = "p:nth-child(2)", attr = "inner")]
    age: u8,

    #[html(selector = "p:nth-child(3)", attr = "inner")]
    like_lemon: bool,
}

#[derive(FromHtml)]
#[html(selector = "#test")]
struct TestUsers {
    #[html(selector = "div")]
    users: Vec<TestUser>,
}

let users = TestUsers::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <div>
            <p>BigBrother</p>
            <p>21</p>
            <p>false</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
let hexilee = &users.users[0];
let big_brother = &users.users[1];
assert_eq!("Hexilee", &hexilee.name);
assert_eq!(20, hexilee.age);
assert!(hexilee.like_lemon);
assert_eq!("BigBrother", &big_brother.name);
assert_eq!(21, big_brother.age);
assert!(!big_brother.like_lemon);

As described in the documentation of the unhtml crate, if you want a Vec<TestUser> directly, you can just:

extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, VecFromHtml};

#[derive(FromHtml)]
struct TestUser {
    #[html(selector = "p:nth-child(1)", attr = "inner")]
    name: String,

    #[html(selector = "p:nth-child(2)", attr = "inner")]
    age: u8,

    #[html(selector = "p:nth-child(3)", attr = "inner")]
    like_lemon: bool,
}

let users = Vec::<TestUser>::from_html("#test > div", r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <div>
            <p>BigBrother</p>
            <p>21</p>
            <p>false</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
let hexilee = &users[0];
let big_brother = &users[1];
assert_eq!("Hexilee", &hexilee.name);
assert_eq!(20, hexilee.age);
assert!(hexilee.like_lemon);
assert_eq!("BigBrother", &big_brother.name);
assert_eq!(21, big_brother.age);
assert!(!big_brother.like_lemon);

Source HTML

with top selector

All source HTML will be parsed as a fragment. The top element is html, and there is no DOCTYPE, head or body.

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>

will be parsed as:

<html lang="en">
    <meta charset="UTF-8">
    <title>Title</title>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</html>

and

<p>Hexilee</p>

will be parsed as:

<html>
    <p>Hexilee</p>
</html>    
// panic

extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Document {
    // no default
    #[html(selector = "head")]
    head: String,

    #[html(selector = "body")]
    body: String,
}

let dicument = Document::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
without top selector

When the derived struct doesn't have a top selector, all source HTML will be parsed as a pure fragment. There is no DOCTYPE, html, head or body.

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>

will be parsed as:

<meta charset="UTF-8">
<title>Title</title>
<div id="test">
   <div>
       <p>Hexilee</p>
       <p>20</p>
       <p>true</p>
   </div>
</div>

and

<p>Hexilee</p>

will be parsed as:

<p>Hexilee</p>
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);

unhtml.rs's People

Contributors

barskern avatar hexilee avatar paolobarbolini avatar

Stargazers

 avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar  avatar

Watchers

 avatar  avatar  avatar  avatar  avatar

unhtml.rs's Issues

unhtml::VecFromHtml missing

This crate looks like it will be very useful to me. Thank you for making it.

I'm struggling to parse multiple items into a Vec.

This is my code (simplified):

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
// use unhtml::VecFromHtml;

use unhtml::FromHtml;

#[derive(FromHtml, Debug)]
#[html]
struct OperationRaw {
    #[html(selector = "a", attr = "href")]
    href: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let html =
        r#"<html><head></head><body><a href='foo'>foo</a><a href='bar'>bar</a></body></html>"#;
    let operations = Vec::<OperationRaw>::from_html(&html)?;
    println!("{:?}", operations);
    Ok(())
}

It prints out:

[OperationRaw { href: "foo" }]

but not { href: "bar" }.

The documentation says to use unhtml::VecFromHtml, but I get the following error when I try and use it:

4 | use unhtml::VecFromHtml;
  |     ^^^^^^^^-----------
  |     |       |
  |     |       help: a similar name exists in the module: `FromHtml`
  |     no `VecFromHtml` in the root

What's the best way to parse multiple items into a Vec?

Thanks, Chris

Getting an Integer out of a multiline `inner`

Hello,

Currently, getting something like

  #[html(selector = "div", attr = "inner")]
  id: usize,

out of something like

<div>
    1
</div>

will return Err(ParseIntError { kind: InvalidDigit }).
Is there any way to ignore whitespace? Should I just minify the contents before using them?

Thanks

Parse into `Option<T>`

This could be useful for things where an element may be present and have useful info, but is sometimes expected to not be present. My specific use case is ranking of sports teams — I currently have to catch the error and reparse into a different struct.

Implement `FromHtml` in terms of `from_html_ref`, not `from_html`

I've been writing a bunch of hand-rolled implementations for FromHtml, and I often end up writing things of the form:

#[derive(Debug)]
struct Thing {

}

impl FromHtml for Thing {
    fn from_html(html: &str) -> Result<Self, Error> {
        let html = Html::parse_fragment(html);
        Self::from_html_ref(html.root_element())
    }

    fn from_html_ref(html: ElementRef) -> Result<Self, Error> {
        // Do actual work
    }
}

It seems that an ElementRef would be a more natural entry point for hand-rolling this logic, instead of dealing with a &str. The performance-shaped part of my brain also freaks out that the default implementation for from_html_ref is:

    fn from_html_ref(elem_ref: ElementRef) -> Result<Self, Error> {
        Ok(Self::from_html(&elem_ref.html())?)
    }

...because I already have an ElementRef, and the parsing is already done! Why do I need to parse this again? :(

So, my suggestion is this: why not change FromHtml to have no default implementation of from_html_ref, and a default for from_html? Something like this:

    fn from_html(html: &str) -> Result<Self, Error> {
        let html = Html::parse_fragment(html);
        Self::from_html_ref(html.root_element())
    }

    fn from_html_ref(html: ElementRef) -> Result<Self, Error>;

Stable rust supported?

Seems like unhtml_derive doesn't support stable rust? Here is the error log when I compile in Rust V1.41.0

   Compiling hyper-tls v0.4.0
   Compiling reqwest v0.10.0
   Compiling unhtml_derive v0.7.4
error[E0554]: `#![feature]` may not be used on the stable release channel
 --> /Users/wichna/.cargo/registry/src/github.com-1ecc6299db9ec823/unhtml_derive-0.7.4/src/lib.rs:1:1
  |
1 | #![feature(proc_macro_diagnostic)]
  | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

error: aborting due to previous error

For more information about this error, try `rustc --explain E0554`.
error: could not compile `unhtml_derive`.

Selector with comma support

For now, it seems that a comma in #[html(selector = "...")] is parsed but not supported. I think it should at least panic as an unsupported operation, and of course it would be better to implement it. Here's a sample snippet:

use unhtml::{self, FromHtml};
use unhtml_derive::*;

#[derive(FromHtml)]
struct Test {
    #[html(selector = "div,p", attr = "inner")]
    text: String,
}

fn main() {
    let test = Vec::<Test>::from_html(
        r#"
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Title</title>
        </head>
        <body>
            <div>Some text</div>
            <p>Some other text</p>
        </body>
        </html>
        "#,
    )
    .unwrap();

    assert_eq!(test.len(), 2);
    assert_eq!(test[0].text, "Some text");
    assert_eq!(test[1].text, "Some other text");
}

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games; make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.