Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .netconfig
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,8 @@
sha = 9a1b07589b9bde93bc12528e9325712a32dec418
etag = b54216ac431a83ce5477828d391f02046527e7f6fffd21da1d03324d352c3efb
weak
[file "src/Web/System/Xml/XmlWrappingReader.cs"]
url = https://github.com/devlooped/catbag/blob/main/System/Xml/XmlWrappingReader.cs
sha = b1f3e12a7107dc81de53fd0a962bd4a149ab1ef7
etag = b2c97f61df993f05a7d6e3627ab10e7933528ad33d91be4ac16323756c522b6b
weak
4 changes: 2 additions & 2 deletions Css.sln → Web.sln
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ VisualStudioVersion = 17.0.31612.314
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tests", "src\Tests\Tests.csproj", "{2E8FE01D-35EB-49E2-B693-490F4ABFDD5D}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Css", "src\Css\Css.csproj", "{442D46AA-C4DC-4AEE-826B-CE98A9C6F837}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Web", "src\Web\Web.csproj", "{442D46AA-C4DC-4AEE-826B-CE98A9C6F837}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{F2D75BE9-587E-46CC-BFE4-D5BA538E325A}"
ProjectSection(SolutionItems) = preProject
.editorconfig = .editorconfig
global.json = global.json
.netconfig = .netconfig
readme.md = readme.md
EndProjectSection
EndProject
Expand Down
Binary file added assets/icon-32.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 8 additions & 0 deletions assets/icon.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
26 changes: 21 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,39 @@
![Icon](https://raw.githubusercontent.com/devlooped/css/main/assets/img/icon-32.png) XLinq to Css
![Icon](https://raw.githubusercontent.com/devlooped/css/main/assets/img/icon-32.png) HTML => XML + CSS with XLinq 🤘
============

[![Version](https://img.shields.io/nuget/vpre/Devlooped.Xml.Css.svg?color=royalblue)](https://www.nuget.org/packages/Devlooped.Xml.Css)
[![Downloads](https://img.shields.io/nuget/dt/Devlooped.Xml.Css.svg?color=green)](https://www.nuget.org/packages/Devlooped.Xml.Css)
[![License](https://img.shields.io/github/license/devlooped/css.svg?color=blue)](https://github.com/devlooped/css/blob/main/license.txt)

Implements CSS selectors for XLinq.
Read HTML as XML and query it with CSS over XLinq.

No need to learn an entirely new object model for a page 🤘.
This makes it the most productive and lean library for web
scraping using the latest and greatest that .NET can offer.

# Usage

```csharp
using Devlooped.Xml.Css;
using System.Xml.Linq;
using Devlooped.Web;

var page = XDocument.Load("page.xhtml")
XDocument page = HtmlDocument.Load("page.html");
IEnumerable<XElement> elements = page.CssSelectElements("div.menuitem");

XElement title = page.CssSelectElement("div[role=alert]");
XElement title = page.CssSelectElement("html head meta[name=title]");
```

By default, `HtmlDocument.Load` will skip non-content elements `script` and
`style`, turn all element names into lower case, and ignore all XML namespaces
(useful when loading XHTML, for example) for easier querying. These options
as well as granular whitespace handling can be configured using the overloads
receiving an `HtmlReaderSettings`.

The underlying parsing is performed by the amazing [SgmlReader](https://www.nuget.org/packages/Microsoft.Xml.SgmlReader)
library by Microsoft's [Chris Lovett](http://lovettsoftware.com/).

## CSS

At the moment, the following CSS selector features are supported:

- [Type selector](https://www.w3.org/TR/selectors-3/#type-selectors)
Expand Down
102 changes: 102 additions & 0 deletions src/Tests/HtmlTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
using System.Xml;
using System.Xml.Linq;
using System.Xml.XPath;
using Devlooped.Web;

namespace Devlooped.Tests;

/// <summary>
/// Exercises <c>HtmlDocument.Load</c> against the sample documents used by the tests,
/// both with its default behavior and with an explicit <c>HtmlReaderSettings</c>.
/// </summary>
public record HtmlTests(ITestOutputHelper Output)
{
    /// <summary>Loads <c>sample.html</c> and writes the resulting document to the test output.</summary>
    [Fact]
    public void Render()
        => Output.WriteLine(HtmlDocument.Load(File("sample.html")).ToString());

    /// <summary>Loading without settings must yield no <c>script</c> elements.</summary>
    [Fact]
    public void ExcludesScriptsByDefault()
    {
        var scripts = HtmlDocument.Load(File("wikipedia.html")).XPathSelectElements("//script");

        Assert.Empty(scripts);
    }

    /// <summary>Loading with a default-constructed settings instance must keep <c>script</c> elements.</summary>
    [Fact]
    public void IncludeScriptsExplicitSettings()
    {
        var settings = new HtmlReaderSettings();

        var scripts = HtmlDocument.Load(File("wikipedia.html"), settings).XPathSelectElements("//script");

        Assert.NotEmpty(scripts);
    }

    /// <summary>Loading without settings must yield no <c>style</c> elements.</summary>
    [Fact]
    public void ExcludesStylesByDefault()
    {
        var styles = HtmlDocument.Load(File("wikipedia.html")).XPathSelectElements("//style");

        Assert.Empty(styles);
    }

    /// <summary>Loading with a default-constructed settings instance must keep <c>style</c> elements.</summary>
    [Fact]
    public void IncludeStylesExplicitSettings()
    {
        var settings = new HtmlReaderSettings();

        var styles = HtmlDocument.Load(File("wikipedia.html"), settings).XPathSelectElements("//style");

        Assert.NotEmpty(styles);
    }

    /// <summary>
    /// Loading the XHTML sample without settings must allow matching <c>h1</c>
    /// with an unqualified XPath expression.
    /// </summary>
    [Fact]
    public void ExcludesXmlNamespacesByDefault()
    {
        var headings = HtmlDocument.Load(File("sample.xhtml")).XPathSelectElements("//h1");

        Assert.NotEmpty(headings);
    }

    /// <summary>
    /// With <c>IgnoreXmlNamespaces</c> turned off, <c>h1</c> is only reachable through
    /// a prefix bound to the XHTML namespace.
    /// </summary>
    [Fact]
    public void IncludeXmlNamespacesExplicitly()
    {
        var settings = new HtmlReaderSettings { IgnoreXmlNamespaces = false };
        var page = HtmlDocument.Load(File("sample.xhtml"), settings);

        var namespaces = new XmlNamespaceManager(new NameTable());
        namespaces.AddNamespace("xh", "http://www.w3.org/1999/xhtml");

        Assert.NotEmpty(page.XPathSelectElements("//xh:h1", namespaces));
        // The unqualified name finds nothing since the elements carry the XHTML namespace.
        Assert.Empty(page.XPathSelectElements("//h1"));
    }

    /// <summary>
    /// With <c>CaseFolding</c> set to upper case, the document must be addressable
    /// through upper-case element names.
    /// </summary>
    [Fact]
    public void CanChangeToUpperCaseHtml()
    {
        var settings = new HtmlReaderSettings
        {
            CaseFolding = Sgml.CaseFolding.ToUpper,
        };

        var page = HtmlDocument.Load(File("wikipedia.html"), settings);

        // The elements in the source file are written in lower case.
        Assert.NotNull(page.XPathSelectElement("/HTML/BODY/DIV/H1/SPAN"));
    }

    /// <summary>
    /// With both whitespace options configured, the selected <c>span</c> value must
    /// equal "Wikipedia" exactly.
    /// </summary>
    [Fact]
    public void HtmlSettings()
    {
        var settings = new HtmlReaderSettings
        {
            TextWhitespace = Sgml.TextWhitespaceHandling.TrimBoth,
            WhitespaceHandling = WhitespaceHandling.None
        };

        var page = HtmlDocument.Load(File("wikipedia.html"), settings);
        var heading = page.XPathSelectElement("/html/body/div/h1/span");

        // The text in the source file has leading and trailing whitespace.
        Assert.Equal("Wikipedia", heading?.Value);
    }

    /// <summary>
    /// Builds an absolute <c>file://</c> URI string for <paramref name="path"/>, using the
    /// full name reported by <see cref="FileInfo"/>.
    /// </summary>
    /// <param name="path">Path of the sample file to reference.</param>
    /// <returns>The <see cref="Uri.AbsoluteUri"/> of the resulting file URI.</returns>
    string File(string path)
    {
        var fullPath = new FileInfo(path).FullName;

        return new Uri("file://" + fullPath).AbsoluteUri;
    }
}
27 changes: 4 additions & 23 deletions src/Tests/Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<AssemblyName>Devlooped.Tests</AssemblyName>
<RootNamespace>Devlooped</RootNamespace>
<NoWarn>xUnit1013</NoWarn>
</PropertyGroup>

<ItemGroup>
Expand All @@ -17,33 +16,15 @@
<ItemGroup>
<Using Include="Xunit" />
<Using Include="Xunit.Abstractions" />
<Using Include="Devlooped.Xml.Css" />
<Import Include="@(Using)" />
<!--
<AdditionalFiles Include="page.html" SourceItemType="Html" />
<CompilerVisibleItemMetadata Include="AdditionalFiles" MetadataName="SourceItemType" />
-->
<Using Include="Devlooped.Web" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Css\Css.csproj" />
<ProjectReference Include="..\Web\Web.csproj" />
</ItemGroup>

<ItemGroup>
<None Update="page.html" Generator="MSBuild:ForceCompile" CopyToOutputDirectory="PreserveNewest"/>
<None Update="xunit.runner.json" CopyToOutputDirectory="PreserveNewest"/>
<None Update="@(None)" CopyToOutputDirectory="PreserveNewest" />
</ItemGroup>

<PropertyGroup>
<CoreCompileDependsOn>ForceCompile;$(CoreCompileDependsOn)</CoreCompileDependsOn>
</PropertyGroup>

<Target Name="ForceCompile" BeforeTargets="CoreCompile"
Inputs="page.html" Outputs="$(BaseIntermediateOutputPath)page.g.cs">
<WriteLinesToFile File="$(BaseIntermediateOutputPath)page.g.cs" Lines="" />
<ItemGroup>
<Compile Include="$(BaseIntermediateOutputPath)page.g.cs" />
</ItemGroup>
</Target>

</Project>
26 changes: 26 additions & 0 deletions src/Tests/sample.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<title>Wikipedia</title>
<script>
// some script
</script>
<style>
/* some style */
</style>
<meta name="viewport" content="initial-scale=1,user-scalable=yes">
<link rel="shortcut icon" href="/static/favicon/wikipedia.ico">
<link rel="license" href="//creativecommons.org/licenses/by-sa/3.0/">
</head>
<body>
<script>
// more script
</script>
<style>
/* some style */
</style>
<h1>
Wikipedia
</h1>
</body>
</html>
8 changes: 8 additions & 0 deletions src/Tests/sample.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<body>
<h1>
Wikipedia
</h1>
</body>
</html>
Loading