base: develop
Add byt5 Model #1742
Conversation
paddlenlp/datasets/__init__.py
Outdated
from .tweetqa import *
from .xsum import *
We have redesigned our datasets module to be fully compatible with HF datasets; the usage follows the HF interface, so there is currently no need to add dataset-related files to the codebase.
OK, I'll take a look.
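For reference, the redesigned interface the reviewer describes is used like the HF datasets API. A minimal sketch, assuming an already-registered dataset name such as 'chnsenticorp' (the name and splits here are only illustrative):

from paddlenlp.datasets import load_dataset

# Load a registered dataset by name; no new builder file is needed in the repo.
train_ds, dev_ds = load_dataset('chnsenticorp', splits=['train', 'dev'])
print(train_ds[0])  # one example as a dict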
paddlenlp/datasets/tweetqa.py
Outdated
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import json
import os

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from . import DatasetBuilder

__all__ = ['tweetqa']


class tweetqa(DatasetBuilder):
    """TweetQA question answering dataset; answers are withheld for the test split."""

    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5', 'URL'))
    SPLITS = {
        'train': META_INFO(
            os.path.join('train.json'), '4e06fd1cfd5e7f0380499df8cbe17237',
            'https://bj.bcebos.com/paddlenlp/datasets/tweetqa/train.json'),
        'dev': META_INFO(
            os.path.join('dev.json'), '9c39d49d25d5296bdc537409208ddc85',
            'https://bj.bcebos.com/paddlenlp/datasets/tweetqa/dev.json'),
        'test': META_INFO(
            os.path.join('test.json'), '9c39d49d25d5296bdc537409208ddc85',
            'https://bj.bcebos.com/paddlenlp/datasets/tweetqa/test.json')
    }

    def _get_data(self, mode, **kwargs):
        # Download the split file if it is missing or its md5 does not match.
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash, URL = self.SPLITS[mode]
        fullname = os.path.join(default_root, filename)
        if not os.path.exists(fullname) or (
                data_hash and not md5file(fullname) == data_hash):
            get_path_from_url(URL, default_root)
        return fullname

    def _read(self, filename, split):
        with open(filename, encoding="utf-8") as f:
            tweet_qa = json.load(f)
            for data in tweet_qa:
                yield {
                    "Question": data["Question"],
                    "label": [] if split == "test" else data["Answer"],
                    "Tweet": data["Tweet"],
                    "qid": data["qid"]
                }
Same as above.
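Given the review above, a local TweetQA-style JSON file can also be consumed without registering a DatasetBuilder, by passing a read function to load_dataset. A minimal sketch under that assumption (the file path and function name are hypothetical; extra keyword arguments are forwarded to the read function as in the PaddleNLP custom-dataset docs):

import json
from paddlenlp.datasets import load_dataset

def read_tweetqa(data_path, split='train'):
    # Yield one dict per example, mirroring the _read method above.
    with open(data_path, encoding='utf-8') as f:
        for data in json.load(f):
            yield {
                'Question': data['Question'],
                'label': [] if split == 'test' else data['Answer'],
                'Tweet': data['Tweet'],
                'qid': data['qid'],
            }

train_ds = load_dataset(read_tweetqa, data_path='train.json', split='train', lazy=False)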
paddlenlp/datasets/xsum.py
Outdated
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import json
import os

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from . import DatasetBuilder

__all__ = ['xsum']


class xsum(DatasetBuilder):
    """XSum abstractive summarization dataset (document/summary pairs)."""

    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5', 'URL'))
    SPLITS = {
        'train': META_INFO(
            os.path.join('train.json'), '4e06fd1cfd5e7f0380499df8cbe17237',
            'https://bj.bcebos.com/paddlenlp/datasets/xsum/train.json'),
        'dev': META_INFO(
            os.path.join('dev.json'), '9c39d49d25d5296bdc537409208ddc85',
            'https://bj.bcebos.com/paddlenlp/datasets/xsum/dev.json'),
        'test': META_INFO(
            os.path.join('test.json'), '9c39d49d25d5296bdc537409208ddc85',
            'https://bj.bcebos.com/paddlenlp/datasets/xsum/test.json')
    }

    def _get_data(self, mode, **kwargs):
        # Download the split file if it is missing or its md5 does not match.
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash, URL = self.SPLITS[mode]
        fullname = os.path.join(default_root, filename)
        if not os.path.exists(fullname) or (
                data_hash and not md5file(fullname) == data_hash):
            get_path_from_url(URL, default_root)
        return fullname

    def _read(self, filename, split):
        with open(filename, encoding="utf-8") as f:
            xsums = json.load(f)
            for data in xsums:
                yield {
                    "document": data["document"],
                    "label": data["summary"],
                    "id": data["id"]
                }
Same as above.
@@ -0,0 +1,383 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2022 (the copyright year should be 2022).
This Pull Request is stale because it has been open for 60 days with no activity.
Add model and datasets
PR changes
Add the new byt5 model; add two datasets, tweetqa and xsum.
Description
The byt5-small model is available at https://aistudio.baidu.com/aistudio/datasetdetail/123125, the xsum dataset at https://aistudio.baidu.com/aistudio/datasetdetail/122619, and the tweetqa dataset at https://aistudio.baidu.com/aistudio/datasetdetail/131440. Please upload the model and datasets; I will then update the links and md5 values of these datasets in my code.
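For context on the model being added: ByT5 operates directly on UTF-8 bytes rather than subword tokens, so its inputs need no learned vocabulary. Below is a library-free sketch of the byte-to-id mapping described in the ByT5 paper (ids 0/1/2 are pad/eos/unk, and each byte value b maps to b + 3); the actual tokenizer class added in this PR is not shown here.

def byt5_encode(text, eos_id=1, offset=3):
    # Map each UTF-8 byte to byte_value + offset, then append the EOS id.
    return [b + offset for b in text.encode('utf-8')] + [eos_id]

print(byt5_encode('Hi!'))  # [75, 108, 36, 1]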