-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Add BQCorpus Dataset #534
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add BQCorpus Dataset #534
Changes from all commits
a5d4065
5ea7130
2bd6e18
512bdb5
6b67497
ac90af3
3758c50
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,3 +35,4 @@ | |
from .cote import * | ||
from .clue import * | ||
from .nlpcc14_sc import * | ||
from .bq_corpus import * |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import collections | ||
import json | ||
import os | ||
|
||
from paddle.dataset.common import md5file | ||
from paddle.utils.download import get_path_from_url | ||
from paddlenlp.utils.env import DATA_HOME | ||
from . import DatasetBuilder | ||
|
||
__all__ = ['bq_corpus'] | ||
|
||
|
||
class bq_corpus(DatasetBuilder):
    """
    Dataset builder for the ``bq_corpus`` sentence-pair data.

    The archive at ``URL`` is expected to contain one tab-separated file
    per split (``train``, ``dev``, ``test``) under a ``bq_corpus``
    directory. Every file starts with a header row; each following row
    holds two texts and a label, and ``get_labels`` lists the label
    values as ``"0"`` and ``"1"``.

    NOTE(review): a reviewer asked for more information about the dataset,
    such as its task and author. Presumably this is the BQ (Bank Question)
    corpus for Chinese sentence semantic-equivalence identification --
    TODO confirm against the dataset's official description and add the
    author/citation here.
    """
    lazy = False
    URL = "https://dataset-bj.cdn.bcebos.com/qianyan/bq_corpus.zip"
    MD5 = "abe6c480b96cb705b4d24bd522848009"
    # Per-split metadata: path relative to the dataset root, and the MD5
    # checksum expected for that file.
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
    SPLITS = {
        'train': META_INFO(
            os.path.join('bq_corpus', 'train.tsv'),
            'd37683e9ee778ee2f4326033b654adb9'),
        'dev': META_INFO(
            os.path.join('bq_corpus', 'dev.tsv'),
            '8a71f2a69453646921e9ee1aa457d1e4'),
        'test': META_INFO(
            os.path.join('bq_corpus', 'test.tsv'),
            'c797995baa248b144ceaa4018b191e52'),
    }

    def _get_data(self, mode, **kwargs):
        """
        Check the local copy of a split and download the archive if needed.

        Args:
            mode (str): Key into ``SPLITS`` (``'train'``, ``'dev'`` or
                ``'test'``). An unknown key raises ``KeyError``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            str: Path of the split file under
            ``DATA_HOME/<class name>/``.
        """
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root, filename)
        # Fetch the archive when the file is missing, or when a checksum is
        # configured for the split and the file on disk does not match it.
        if not os.path.exists(fullname) or (data_hash and
                                            md5file(fullname) != data_hash):
            get_path_from_url(self.URL, default_root, self.MD5)

        return fullname

    def _read(self, filename):
        """
        Read a tab-separated split file and yield one example per row.

        The first line is always treated as a header and discarded. Blank
        lines are skipped. A row with two fields is yielded with an empty
        label (presumably the case for an unlabeled test split -- TODO
        confirm).

        Args:
            filename (str): Path of the UTF-8 encoded ``.tsv`` file.

        Yields:
            dict: ``{"texta": str, "textb": str, "label": str}``.

        Raises:
            ValueError: If a non-blank row has neither two nor three
                tab-separated fields.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            # Discard the header row.
            next(f, None)
            for line_no, line in enumerate(f, start=2):
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data = stripped.split("\t")
                if len(data) == 3:
                    texta, textb, label = data
                elif len(data) == 2:
                    # No label column on this row; keep the output schema
                    # stable by emitting an empty label.
                    texta, textb = data
                    label = ''
                else:
                    raise ValueError(
                        "{}:{}: expected 2 or 3 tab-separated fields, "
                        "got {}".format(filename, line_no, len(data)))
                # NOTE(review): a reviewer suggested naming these keys
                # "sentence1" and "sentence2"; the existing keys are kept
                # here so current consumers keep working.
                yield {"texta": texta, "textb": textb, "label": label}

    def get_labels(self):
        """
        Return labels of the bq_corpus object.

        Returns:
            list[str]: The two label values, ``["0", "1"]``.
        """
        return ["0", "1"]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,4 +4,5 @@ h5py | |
colorlog | ||
colorama | ||
seqeval | ||
multiprocess | ||
multiprocess | ||
pre-commit | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why add this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I actually hadn't seen these two lines in my code file until now. Maybe I added them while I was using Git Bash commands. I'm new to GitHub, so I made mistakes. |
Uh oh!
There was an error while loading. Please reload this page.