Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions paddlenlp/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@
from .cote import *
from .clue import *
from .nlpcc14_sc import *
from .bq_corpus import *
61 changes: 61 additions & 0 deletions paddlenlp/datasets/bq_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import collections
import json
import os

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from . import DatasetBuilder

__all__ = ['bq_corpus']


class bq_corpus(DatasetBuilder):
"""
Dataset builder for the bq_corpus dataset.

The data is distributed as a single zip archive (see ``URL``) holding one
tab-separated file per split: 'train', 'dev' and 'test'. After a header
line, each row is read as a pair of texts followed by a label, and the
label set is "0" / "1".

NOTE(review): a reviewer asked for more information about the dataset
(task, author) to be added here; presumably this is a text-pair matching
task -- confirm and fill in.
"""
# NOTE(review): presumably tells DatasetBuilder how to load the data
# (eagerly rather than lazily) -- confirm against DatasetBuilder.
lazy = False
# Archive downloaded by _get_data when a split file is missing or fails its
# MD5 check; MD5 is passed to get_path_from_url together with URL.
URL = "https://dataset-bj.cdn.bcebos.com/qianyan/bq_corpus.zip"
MD5 = "abe6c480b96cb705b4d24bd522848009"
# Per-split record: path of the split file relative to the dataset root and
# the MD5 the file is expected to have.
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
# Maps the split name passed to _get_data to its META_INFO record.
SPLITS = {
'train': META_INFO(
os.path.join('bq_corpus', 'train.tsv'),
'd37683e9ee778ee2f4326033b654adb9'),
'dev': META_INFO(
os.path.join('bq_corpus', 'dev.tsv'),
'8a71f2a69453646921e9ee1aa457d1e4'),
'test': META_INFO(
os.path.join('bq_corpus', 'test.tsv'),
'c797995baa248b144ceaa4018b191e52'),
}

def _get_data(self, mode, **kwargs):
    """Return the local path of the split file for ``mode``, fetching it if needed.

    The file is looked up under ``DATA_HOME/<class name>``. When it is absent,
    or when an expected MD5 is recorded for the split and the file on disk
    does not match it, the archive at ``self.URL`` is fetched through
    ``get_path_from_url`` before the path is returned.

    Args:
        mode: Key into ``self.SPLITS`` selecting the split.
        **kwargs: Accepted but not used here.

    Returns:
        The full path of the split file.
    """
    root_dir = os.path.join(DATA_HOME, self.__class__.__name__)
    relative_path, expected_md5 = self.SPLITS[mode]
    target = os.path.join(root_dir, relative_path)

    # A cached file is usable when it exists and, if a checksum is recorded,
    # its MD5 matches that checksum.
    cache_ok = os.path.exists(target)
    if cache_ok and expected_md5:
        cache_ok = md5file(target) == expected_md5

    if not cache_ok:
        get_path_from_url(self.URL, root_dir, self.MD5)

    return target

def _read(self, filename):
"""Reads data."""
with open(filename, 'r', encoding='utf-8') as f:
head = None
for line in f:
data = line.strip().split("\t")
if not head:
head = data
else:
texta, textb, label = data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to use "sentence1" and "sentence2" here.

yield {"texta": texta, "textb": textb, "label": label}

def get_labels(self):
    """Return the list of label strings used by bq_corpus: "0" and "1"."""
    label_names = [str(label_id) for label_id in (0, 1)]
    return label_names
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ h5py
colorlog
colorama
seqeval
multiprocess
multiprocess
pre-commit
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this added?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I actually hadn't seen these two lines in my code file until now. Maybe I added them while I was using Git Bash commands. I'm new to GitHub, so I made some mistakes.