Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix apache waf extract; add test #342

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ jobs:
runs-on: ubuntu-latest
container:
image: ${{ matrix.ckan-image }}
options: --user root
services:
solr:
image: ckan/ckan-solr:${{ matrix.solr-image }}
Expand All @@ -63,7 +64,8 @@ jobs:

- name: Install dependencies (common)
run: |
DEBIAN_FRONTEND=noninteractive apt-get --assume-yes --quiet install \
DEBIAN_FRONTEND=noninteractive apt-get update && \
apt-get --assume-yes --quiet install \
python3-dev \
libxml2-dev \
libxslt1-dev \
Expand All @@ -72,7 +74,7 @@ jobs:
- name: Install dependencies from requirements.txt
run: |
pip install -r requirements.txt
pip install pytest-ckan
pip install -r dev-requirements.txt

- name: Install harvester
run: |
Expand Down
2 changes: 2 additions & 0 deletions ckanext/spatial/harvesters/waf.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
if 'mailto:' in url:
continue
if '..' not in url and url[-1] == '/':
if scraper == 'apache' and url[0] == '/':
continue
new_depth = depth + 1
if depth > 10:
log.info('Max WAF depth reached')
Expand Down
13 changes: 13 additions & 0 deletions ckanext/spatial/tests/waf_extract/html_files/apache-folder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /apache-folder</title>
</head>
<body>
<h1>Index of /apache-folder</h1>
<pre> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr> <a href="/">Parent Directory</a> -
<a href="record-1.xml">record-1.xml</a> 2024-11-07 15:00 356K
<a href="subfolder/">subfolder/</a> 2024-11-12 15:00 -
<hr></pre>
</body></html>
12 changes: 12 additions & 0 deletions ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /apache-folder/subfolder</title>
</head>
<body>
<h1>Index of /apache-folder/subfolder</h1>
<pre> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr> <a href="/apache-folder/">Parent Directory</a> -
<a href="record-2.xml">record-2.xml</a> 2024-11-07 16:59 182K
<hr></pre>
</body></html>
5 changes: 5 additions & 0 deletions ckanext/spatial/tests/waf_extract/html_files/iis-folder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<html><head><title>iis.server - /iis-folder/</title></head><body><H1>iis.server - /iis-folder/</H1><hr>

<pre><A HREF="/">[To Parent Directory]</A><br><br> 11/7/2024 7:20 AM &lt;dir&gt; <A HREF="/iis-folder/subfolder/">subfolder</A><br> 11/7/2024 3:00 PM 168 <A HREF="/iis-folder/record-1.xml">record-1.xml</A><br></pre><hr></body></html>


Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><head><title>iis.server - /iis-folder/subfolder/</title></head><body><H1>iis.server - /iis-folder/subfolder/</H1><hr>

<pre><A HREF="/iis-folder/">[To Parent Directory]</A><br><br> 11/7/2024 4:59 PM 8958 <A HREF="/iis-folder/subfolder/record-2.xml">record-2.xml</A><br></pre><hr></body></html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

<html>
<head><title>Index of /nginx/</title></head>
<body bgcolor="white">
<h1>Index of /nginx/</h1><hr><pre><a href="../">../</a>
<a href="subfolder/">subfolder/</a> 07-Nov-2024 15:00 -
<a href="record-1.xml">record-1.xml</a> 07-Nov-2024 15:00 364868
</pre><hr></body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

<html>
<head><title>Index of /nginx/subfolder/</title></head>
<body bgcolor="white">
<h1>Index of /nginx/subfolder/</h1><hr><pre><a href="../">../</a>
<a href="record-2.xml">record-2.xml</a> 07-Nov-2024 16:59 186150
</pre><hr></body>
</html>
53 changes: 53 additions & 0 deletions ckanext/spatial/tests/waf_extract/test_waf_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os

from ckanext.spatial.harvesters.waf import _extract_waf

TEST_DIR = os.path.dirname(os.path.abspath(__file__))
HTML_DIR = os.path.join(TEST_DIR, "html_files")

def _read_fixture(filename):
    """Return the text of one static directory-listing page in HTML_DIR."""
    # Explicit encoding so the fixture is decoded the same way on every
    # platform instead of depending on the locale default.
    with open(os.path.join(HTML_DIR, filename), "r", encoding="utf-8") as fixture:
        return fixture.read()


def test_extract_iis(httpserver):
    """Check that _extract_waf crawls IIS, nginx and Apache listings.

    Despite its name, this test covers all three supported scrapers.  For
    each one the top-level listing is passed to ``_extract_waf`` directly,
    while the matching subfolder listing is served by the ``httpserver``
    fixture so the extractor can fetch it when it follows the subfolder
    link.  Every scraper is expected to yield the same two records: one
    from the top-level folder and one from the subfolder.
    """
    records_expected = [
        ('record-1.xml', '2024-11-07 15:00:00'),
        ('record-2.xml', '2024-11-07 16:59:00'),
    ]

    for scraper in ("iis", "nginx", "apache"):
        # Feed the static subfolder page when the extractor traverses into it.
        httpserver.expect_request(
            f"/{scraper}-folder/subfolder/"
        ).respond_with_data(_read_fixture(f"{scraper}-subfolder.html"))

        # Let it scrape, traverse and extract the content.
        results = _extract_waf(
            _read_fixture(f"{scraper}-folder.html"),
            httpserver.url_for(f"/{scraper}-folder/"),
            scraper,
        )

        # Compare on (file name, date) only: the host/port part of each URL
        # depends on where the test server happens to be listening.
        records_found = sorted((os.path.basename(r[0]), r[1]) for r in results)
        assert records_expected == records_found, (
            f"unexpected records for scraper {scraper!r}"
        )
3 changes: 3 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pytest-ckan
pytest-httpserver == 1.0.2; python_version < '3.10'
pytest-httpserver; python_version >= '3.10'