Skip to content

Commit 06f8d2e

Browse files
author
User
committed
Move rest of weblib stuff to grab.util. Remove weblib deps from setup/pyproject.
1 parent a60a693 commit 06f8d2e

File tree

12 files changed

+156
-21
lines changed

12 files changed

+156
-21
lines changed

grab/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from __future__ import absolute_import
22

3-
from weblib.logs import default_logging # noqa
4-
53
from grab.base import Grab # noqa
64
from grab.error import ( # noqa
75
DataNotFound,
@@ -11,6 +9,7 @@
119
GrabTimeoutError,
1210
)
1311
from grab.upload import UploadContent, UploadFile # noqa
12+
from grab.util.log import default_logging # noqa
1413

1514
__version__ = "1.0.0"
1615
VERSION_NUMERIC = tuple(map(int, __version__.split(".")))

grab/deprecated.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import six
44
from selection import SelectionNotFoundError
55
from selection.const import UNDEFINED
6-
from weblib.etree import get_node_text
6+
from grab.util.etree import get_node_text
77
from grab.util.text import find_number
88

99
from grab import error

grab/document.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,15 @@
2929
from selection import SelectionNotFoundError, XpathSelector
3030
from six import BytesIO, StringIO
3131
from six.moves.urllib.parse import parse_qs, urljoin, urlsplit
32-
from weblib.files import hashed_path
33-
from weblib.rex import normalize_regexp
34-
from grab.util.text import normalize_space
3532

3633
from grab.cookie import CookieManager
3734
from grab.error import DataNotFound, GrabMisuseError
3835
from grab.unset import UNSET, UnsetType
36+
from grab.util.files import hashed_path
3937
from grab.util.html import decode_entities, find_refresh_url
4038
from grab.util.http import smart_urlencode
39+
from grab.util.rex import normalize_regexp
40+
from grab.util.text import normalize_space
4141
from grab.util.warning import warn
4242

4343
NULL_BYTE = chr(0)

grab/script/crawl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
from argparse import ArgumentParser
55

66
import six
7-
from weblib.files import clear_directory
87

98
from grab.util.config import build_root_config, build_spider_config
109
from grab.util.encoding import make_str
10+
from grab.util.files import clear_directory
1111
from grab.util.log import default_logging
1212
from grab.util.module import load_spider_class
1313

grab/spider/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import six
1212
from six.moves.queue import Empty, Queue
13-
from weblib import metric
1413

1514
from grab.base import Grab
1615
from grab.error import GrabInvalidUrl
@@ -24,6 +23,7 @@
2423
from grab.spider.task_generator_service import TaskGeneratorService
2524
from grab.stat import Stat
2625
from grab.unset import UNSET
26+
from grab.util.metric import format_traffic_value
2727
from grab.util.misc import camel_case_to_underscore
2828
from grab.util.warning import warn
2929

@@ -425,7 +425,7 @@ def render_stats(self, timing=None):
425425
if "download-size" in self.stat.counters:
426426
out.append(
427427
"Network download: %s"
428-
% metric.format_traffic_value(self.stat.counters["download-size"])
428+
% format_traffic_value(self.stat.counters["download-size"])
429429
)
430430
out.append(
431431
"Queue size: %d" % self.task_queue.size() if self.task_queue else "NA"

grab/util/etree.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import six
2+
3+
from grab.util.text import normalize_space as normalize_space_func
4+
5+
6+
def get_node_text(node, smart=False, normalize_space=True):
    """
    Return the text of `node` together with the text of its descendants.

    `node` may also be a plain string (an XPath query that selects an
    attribute yields strings, not nodes); it is then used as the text
    itself.

    With `smart` enabled the non-blank text fragments are joined with
    a single space, and fragments that belong to <script> and <style>
    elements are left out.

    Without `smart` the result of `text_content()` is used, falling back
    to joining all descendant text nodes when the node has no such method.

    When `normalize_space` is true the resulting text is passed through
    the whitespace normalizer before being returned.
    """

    if isinstance(node, six.string_types):
        # Attribute values arrive as strings: nothing to extract.
        text = node
    elif smart:
        fragments = node.xpath(
            './descendant-or-self::*[name() != "script" and '
            'name() != "style"]/text()[normalize-space()]'
        )
        text = " ".join(fragments)
    else:
        # Nodes of a tree built with lxml.etree.fromstring
        # do not provide the text_content() method.
        try:
            text = node.text_content()
        except AttributeError:
            text = "".join(node.xpath(".//text()"))

    return normalize_space_func(text) if normalize_space else text

grab/util/files.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import shutil
3+
from hashlib import sha1
4+
5+
from six.moves.urllib.parse import urlsplit
6+
7+
8+
def hashed_path_details(url, ext="jpg", base_dir=None):
    """
    Build a two-level hashed storage location for `url`.

    The SHA-1 hex digest of `url` is split into three parts: the first
    two characters and the next two characters name nested directories,
    the remaining characters become the file name.

    :param url: text or bytes to hash; text is encoded as UTF-8 first
        because hashlib accepts only bytes
    :param ext: file extension appended after a dot, or None to get
        a file name without any extension
    :param base_dir: optional prefix placed in front of the directory
    :returns: dict with the keys "directory", "filename" and "full_path"
    """

    # sha1() raises TypeError when it is given text instead of bytes.
    if not isinstance(url, bytes):
        url = url.encode("utf-8")
    _hash = sha1(url).hexdigest()
    a, b, tail = _hash[:2], _hash[2:4], _hash[4:]
    directory = "%s/%s" % (a, b)
    if base_dir is not None:
        directory = "%s/%s" % (base_dir, directory)
    if ext is not None:
        filename = "%s.%s" % (tail, ext)
    else:
        filename = tail
    full_path = "%s/%s" % (directory, filename)
    return {
        "directory": directory,
        "filename": filename,
        "full_path": full_path,
    }
24+
25+
26+
def hashed_path(url, ext="jpg", base_dir=None):
    """
    Return only the "full_path" entry computed by `hashed_path_details`.
    """

    details = hashed_path_details(url, ext=ext, base_dir=base_dir)
    return details["full_path"]
28+
29+
30+
def clear_directory(path):
    """
    Delete recursively all directories and files in
    specified directory.

    The directory `path` itself is kept. A symbolic link found inside
    it is removed as a link: the link target is left untouched.
    Nothing happens if `path` can not be listed (os.walk ignores
    listing errors by default).
    """

    for root, dirs, files in os.walk(path):
        for fname in files:
            os.unlink(os.path.join(root, fname))
        for _dir in dirs:
            target = os.path.join(root, _dir)
            # os.walk reports links to directories among `dirs`, but
            # shutil.rmtree refuses to work on a symbolic link.
            if os.path.islink(target):
                os.unlink(target)
            else:
                shutil.rmtree(target)
        # The top level is empty now, every subdirectory is gone:
        # there is nothing left to descend into.
        break

grab/util/log.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44
from io import TextIOBase
55

66

7-
def default_logging(grab_log=None, # '/tmp/grab.log',
8-
network_log=None, # '/tmp/grab.network.log',
9-
level=logging.DEBUG, mode='a',
10-
propagate_network_logger=False):
7+
def default_logging(
8+
grab_log=None, # '/tmp/grab.log',
9+
level=logging.DEBUG,
10+
mode="a",
11+
network_log=None, # '/tmp/grab.network.log',
12+
propagate_network_logger=False,
13+
):
1114
"""
1215
Customize logging output to display all log messages
1316
except grab network logs.
@@ -17,14 +20,14 @@ def default_logging(grab_log=None, # '/tmp/grab.log',
1720

1821
logging.basicConfig(level=level)
1922

20-
network_logger = logging.getLogger('grab.network')
23+
network_logger = logging.getLogger("grab.network")
2124
network_logger.propagate = propagate_network_logger
2225
if network_log:
2326
hdl = logging.FileHandler(network_log, mode)
2427
network_logger.addHandler(hdl)
2528
network_logger.setLevel(level)
2629

27-
grab_logger = logging.getLogger('grab')
30+
grab_logger = logging.getLogger("grab")
2831
if grab_log:
2932
hdl = logging.FileHandler(grab_log, mode)
3033
grab_logger.addHandler(hdl)
@@ -54,18 +57,18 @@ def write(self, data):
5457
self.buf.append(data)
5558

5659
def get_output(self):
57-
return ''.join(self.buf)
60+
return "".join(self.buf)
5861

5962
@contextmanager
6063
def handle_sigint(self):
6164
with self.record():
6265
try:
6366
yield
64-
except Exception: # pylint: disable=broad-except
65-
if 'KeyboardInterrupt' in self.get_output():
67+
except Exception: # pylint: disable=broad-except
68+
if "KeyboardInterrupt" in self.get_output():
6669
raise KeyboardInterrupt
6770
else:
6871
raise
6972
else:
70-
if 'KeyboardInterrupt' in self.get_output():
73+
if "KeyboardInterrupt" in self.get_output():
7174
raise KeyboardInterrupt

grab/util/metric.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Sizes of the binary traffic units expressed in bytes.
KB = 2 ** 10
MB = 2 ** 20
GB = 2 ** 30
4+
5+
6+
def in_unit(num, unit):
    """
    Convert the byte count `num` into `unit`.

    For "kb", "mb" and "gb" the result is a float rounded to two
    decimal places. For "b" and for any unit that is not recognised
    `num` is returned as is.
    """

    if unit == "kb":
        divisor = KB
    elif unit == "mb":
        divisor = MB
    elif unit == "gb":
        divisor = GB
    else:
        # Plain bytes and unknown units need no conversion.
        return num
    return round(num / float(divisor), 2)
17+
18+
19+
def format_traffic_value(num):
    """
    Render the byte count `num` as text using the largest fitting unit.

    Values below one KB are shown in bytes, below one MB in KB,
    below one GB in MB, everything else in GB.
    """

    # (exclusive upper bound, output template, unit passed to in_unit)
    scale = (
        (KB, "%s B", "b"),
        (MB, "%s KB", "kb"),
        (GB, "%s MB", "mb"),
    )
    for limit, template, unit in scale:
        if num < limit:
            return template % in_unit(num, unit)
    return "%s GB" % in_unit(num, "gb")

grab/util/rex.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import re
2+
3+
import six
4+
5+
# Module-level cache filled by cache_regexp():
# maps (pattern, flags) tuples to compiled regular expression objects.
REGEXP_CACHE = {}
6+
7+
8+
def cache_regexp(rex, flags=0):
    """
    Return the compiled form of the pattern `rex` built with `flags`.

    The compiled object is stored in REGEXP_CACHE under the
    (rex, flags) key and reused by the following calls.
    """

    cache_key = (rex, flags)
    compiled = REGEXP_CACHE.get(cache_key)
    if compiled is None:
        compiled = re.compile(rex, flags)
        REGEXP_CACHE[cache_key] = compiled
    return compiled
17+
18+
19+
def normalize_regexp(regexp, flags=0):
    """
    Turn `regexp` into a regular expression object.

    A string is compiled with `flags` (through the cache of compiled
    patterns). Any other value is returned unchanged.
    """

    if not isinstance(regexp, six.string_types):
        return regexp
    return cache_regexp(regexp, flags)

0 commit comments

Comments
 (0)