Merged
Changes from 1 commit (43 commits in this pull request)
0260530  initial commit (1649759610, May 18, 2022)
5f66ab1  modify _calc_img_embeddings to support running without img embedding. (1649759610, May 18, 2022)
73acf83  remove commented code (1649759610, May 18, 2022)
6b457e8  delete README (1649759610, May 18, 2022)
1ab0a3f  refine readme.md (1649759610, May 18, 2022)
3a5f203  change question (1649759610, May 18, 2022)
c54dc88  modify layoutxlm to support training without image embedding (1649759610, May 18, 2022)
84a215d  modify _calc_img_embeddings in layoutxlm to support training without … (1649759610, May 18, 2022)
6e8582b  modify _calc_img_embeddings in layoutxlm to support training without … (1649759610, May 18, 2022)
fd0e0ea  Merge branch 'develop' into develop (yingyibiao, May 18, 2022)
3255d51  refine .gitignore (1649759610, May 18, 2022)
b3144e3  refine Rerank with pre-commit (1649759610, May 18, 2022)
d3c3a09  refine Extraction with pre-commit (1649759610, May 18, 2022)
fde5156  refine code and readme details (1649759610, May 18, 2022)
a812206  Merge branch 'PaddlePaddle:develop' into develop (1649759610, May 18, 2022)
93af9e0  Merge branch 'develop' of github.com:1649759610/PaddleNLP into develop (1649759610, May 18, 2022)
a497049  refine coding (1649759610, May 18, 2022)
53cfcf8  refine code style about imports (1649759610, May 18, 2022)
812a508  refine README (1649759610, May 18, 2022)
18abfa8  set CUDA_VISIBLE_DEVICES 0 (1649759610, May 18, 2022)
fe63856  refine code style (1649759610, May 18, 2022)
ff9d6ee  refine readme (1649759610, May 18, 2022)
7218e61  refine readme (1649759610, May 18, 2022)
8b28fc2  delete ocr parsing file (1649759610, May 18, 2022)
fb26c50  Merge branch 'develop' into develop (May 18, 2022)
37d1456  refine readme (1649759610, May 18, 2022)
d5f2451  Merge branch 'develop' of github.com:1649759610/PaddleNLP into develop (1649759610, May 18, 2022)
8c39aa9  refine readme (1649759610, May 23, 2022)
ee0b712  refine Readme (1649759610, May 23, 2022)
2f7262a  Merge branch 'PaddlePaddle-develop' into develop (1649759610, May 23, 2022)
7e088d0  refine README.md (1649759610, May 23, 2022)
eedb2d4  refine readme (1649759610, May 25, 2022)
706566d  refine readme (1649759610, May 25, 2022)
3e2c425  refine readme (1649759610, May 25, 2022)
8331820  Merge branch 'PaddlePaddle-develop' into develop (1649759610, May 25, 2022)
d8f3aaa  refine readme (1649759610, May 25, 2022)
fcaa27e  optimize ocr and mrc module (1649759610, May 31, 2022)
e8c08d3  Merge branch 'PaddlePaddle:develop' into develop (1649759610, May 31, 2022)
312bcb0  Merge branch 'develop' of github.com:1649759610/PaddleNLP into develop (1649759610, May 31, 2022)
f0f1edb  refine code style (1649759610, May 31, 2022)
a3bb28f  Merge branch 'PaddlePaddle:develop' into develop (1649759610, May 31, 2022)
4c6b248  set params with argparse (1649759610, May 31, 2022)
f6d85fc  rename max_seq_length to max_seq_len (1649759610, May 31, 2022)
refine Extraction with pre-commit
1649759610 committed May 18, 2022
commit d3c3a09bc10eea554a3e63c6865754d3214f94de
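This commit is mechanical reformatting; none of the hunks below change behavior. For reference, a minimal sketch of the kind of formatting pass a yapf-based pre-commit hook applies. yapf and the pep8 base style are assumptions here, not necessarily this repository's exact hook configuration:

# Hypothetical illustration of a yapf formatting pass; style_config is an
# assumed value, not this repo's actual pre-commit configuration.
from yapf.yapflib.yapf_api import FormatCode

source = 'if __name__=="__main__":\n    pass\n'
formatted, changed = FormatCode(source, style_config="pep8")
print(changed)    # True: spacing around "==" gets normalized
print(formatted)  # 'if __name__ == "__main__":\n    pass\n'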
8 changes: 6 additions & 2 deletions applications/doc_vqa/Extraction/change_to_mrc.py
@@ -2,25 +2,29 @@
 import json
 import numpy as np
 
+
 def get_top1_from_ranker(path):
     with open(path, "r", encoding="utf-8") as f:
         scores = [float(line.strip()) for line in f.readlines()]
     top_id = np.argmax(scores)
 
     return top_id
 
+
 def get_ocr_result_by_id(path, top_id):
     with open(path, "r", encoding="utf-8") as f:
         reses = f.readlines()
     res = reses[top_id]
     return json.loads(res)
 
+
 def write_to_file(doc, path):
     with open(path, "w", encoding="utf-8") as f:
         json.dump(doc, f, ensure_ascii=False)
         f.write("\n")
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     question = sys.argv[1]
     ranker_result_path = "../Rerank/data/demo.score"
     ocr_result_path = "../OCR_process/demo_ocr_res.json"
@@ -30,4 +34,4 @@ def write_to_file(doc, path):
     doc["question"] = question
     doc["img_id"] = str(top_id + 1)
 
-    write_to_file(doc, save_path)
\ No newline at end of file
+    write_to_file(doc, save_path)
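Taken together, change_to_mrc.py picks the page the reranker scored highest and wraps it into a single MRC input. A self-contained sketch of that flow on toy data; the file formats (one float per line in demo.score, one JSON object per line in the OCR results) are inferred from the functions above:

import json

import numpy as np

# Toy ranker scores, one per candidate page (stands in for demo.score).
scores = [0.12, 0.87, 0.33]
top_id = int(np.argmax(scores))  # index of the best-ranked page -> 1

# Toy OCR results, one JSON object per line (stands in for demo_ocr_res.json).
ocr_lines = [
    '{"document": ["page", "one"]}',
    '{"document": ["page", "two"]}',
    '{"document": ["page", "three"]}',
]
doc = json.loads(ocr_lines[top_id])

doc["question"] = "what is the total amount?"  # the script reads sys.argv[1]
doc["img_id"] = str(top_id + 1)                # 1-based page id, as above
print(doc)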
193 changes: 104 additions & 89 deletions applications/doc_vqa/Extraction/docvqa.py
@@ -13,13 +13,14 @@
 
 
 class DocVQAExample(object):
-    def __init__(self, q_id,
-            question,
-            doc_tokens,
-            doc_boxes = [],
-            answer=None,
-            labels=None,
-            image=None):
+    def __init__(self,
+                 q_id,
+                 question,
+                 doc_tokens,
+                 doc_boxes=[],
+                 answer=None,
+                 labels=None,
+                 image=None):
         self.q_id = q_id
         self.question = question
         self.doc_tokens = doc_tokens
@@ -31,17 +32,18 @@ def __init__(self, q_id,
 
 class DocVQAFeatures(object):
     """A single set of features of data."""
+
     def __init__(self,
-            unique_id,
-            q_id,
-            example_index,
-            input_ids,
-            input_mask,
-            segment_ids,
-            p_index=None,
-            image=None,
-            boxes=None,
-            label=None):
+                 unique_id,
+                 q_id,
+                 example_index,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 p_index=None,
+                 image=None,
+                 boxes=None,
+                 label=None):
         self.unique_id = unique_id
         self.q_id = q_id
         self.example_index = example_index
@@ -53,6 +55,7 @@ def __init__(self,
         self.image = image
         self.label = label
 
+
 class DocVQA(Dataset):
     def __init__(self,
                  args,
@@ -75,15 +78,15 @@ def __init__(self,
         self.return_attention_mask = return_attention_mask
         self.max_seq_len = max_seq_len
         self.sample_list = None
-        self.args = args 
+        self.args = args
 
         if self.pad_token_label_id is None:
             self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
 
         self.entities_labels = {'HEADER': 0, 'QUESTION': 1, 'ANSWER': 2}
         self.encode_inputs = []
         self.docvqa_inputs = self.docvqa_input()
-        
+
     def check_is_max_context(self, doc_spans, cur_span_index, position):
         """Check if this is the 'max context' doc span for the token."""
 
@@ -113,20 +116,24 @@ def check_is_max_context(self, doc_spans, cur_span_index, position):
                 continue
             num_left_context = position - doc_span.start
             num_right_context = end - position
-            score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+            score = min(num_left_context,
+                        num_right_context) + 0.01 * doc_span.length
             if best_score is None or score > best_score:
                 best_score = score
                 best_span_index = span_index
 
         return cur_span_index == best_span_index
 
-    def convert_examples_to_features(self, examples, tokenizer, max_seq_length,
-            max_span_num,
-            max_doc_length,
-            max_query_length,
-            task=None,
-            label_list=None,
-            output_mode=None):
+    def convert_examples_to_features(self,
+                                     examples,
+                                     tokenizer,
+                                     max_seq_length,
+                                     max_span_num,
+                                     max_doc_length,
+                                     max_query_length,
+                                     task=None,
+                                     label_list=None,
+                                     output_mode=None):
 
         label_map = {label: i for i, label in enumerate(label_list)}
         if "[CLS]" in self.tokenizer.get_vocab():
@@ -147,10 +154,10 @@ def convert_examples_to_features(self, examples, tokenizer, max_seq_length,
             all_doc_tokens = example.doc_tokens
             all_doc_boxes_tokens = example.doc_boxes
 
-            cls_token_box=[0, 0, 0, 0]
-            sep_token_box=[1000, 1000, 1000, 1000]
-            pad_token_box=[0, 0, 0, 0]
-            ques_token_box=[0, 0, 0, 0]
+            cls_token_box = [0, 0, 0, 0]
+            sep_token_box = [1000, 1000, 1000, 1000]
+            pad_token_box = [0, 0, 0, 0]
+            ques_token_box = [0, 0, 0, 0]
 
             # The -3 accounts for [CLS], [SEP] and [SEP]
             max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
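The -3 budget is easy to verify with concrete numbers, since the sequence is laid out as [CLS] query [SEP] document [SEP]:

# Worked instance of the token budget above.
max_seq_length = 512
query_tokens = ["what", "is", "the", "total"]  # toy 4-token query
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
print(max_tokens_for_doc)  # 505: 512 minus 4 query tokens minus 3 special tokens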
@@ -198,7 +205,8 @@ def convert_examples_to_features(self, examples, tokenizer, max_seq_length,
                 p_index.append(0)
                 for i in range(doc_span.length):
                     split_token_index = doc_span.start + i
-                    is_max_context = self.check_is_max_context(doc_spans, doc_span_index, split_token_index)
+                    is_max_context = self.check_is_max_context(
+                        doc_spans, doc_span_index, split_token_index)
                     token_is_max_context[len(tokens)] = is_max_context
                     tokens.append(all_doc_tokens[split_token_index])
                     boxes_tokens.append(all_doc_boxes_tokens[split_token_index])
@@ -240,7 +248,7 @@ def convert_examples_to_features(self, examples, tokenizer, max_seq_length,
                 spans_input_mask = spans_input_mask[0:max_span_num]
                 spans_segment_ids = spans_segment_ids[0:max_span_num]
                 spans_boxes_tokens = spans_boxes_tokens[0:max_span_num]
-                p_index = p_index[0:512*max_span_num]
+                p_index = p_index[0:512 * max_span_num]
             while len(spans_input_ids) < max_span_num:
                 tokens = []
                 boxes_tokens = []
@@ -281,8 +289,8 @@ def convert_examples_to_features(self, examples, tokenizer, max_seq_length,
             labels = ["O"] * (spans_input_ids[0].index(sep_id) + 1) + labels
             if len(labels) > 512:
                 labels = labels[:512]
-            if len(labels)<512:
-
+            if len(labels) < 512:
+
                 labels += ["O"] * (512 - len(labels))
             assert len(spans_input_ids[0]) == len(labels)
 
@@ -291,20 +299,19 @@ def convert_examples_to_features(self, examples, tokenizer, max_seq_length,
                 if l not in label_map:
                     label_ids.append(0)
                 else:
-                    label_ids.append(label_map[l])
+                    label_ids.append(label_map[l])
 
             feature = DocVQAFeatures(
-                unique_id=unique_id,
-                q_id=example.q_id,
-                example_index=example_index,
-                input_ids=spans_input_ids,
-                input_mask=spans_input_mask,
-                segment_ids=spans_segment_ids,
-                boxes=spans_boxes_tokens,
-                p_index=p_index,
-                image=example.image,
-                label=label_ids,
-            )
+                unique_id=unique_id,
+                q_id=example.q_id,
+                example_index=example_index,
+                input_ids=spans_input_ids,
+                input_mask=spans_input_mask,
+                segment_ids=spans_segment_ids,
+                boxes=spans_boxes_tokens,
+                p_index=p_index,
+                image=example.image,
+                label=label_ids, )
             features.append(feature)
             unique_id += 1
         return features
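The label scheme used here tags answer spans with O/B-ans/I-ans/E-ans, and label_map assigns ids in list order. A toy illustration of how a span maps to label ids (the example sentence is invented):

# O/B-ans/I-ans/E-ans tagging for an invented answer span.
tokens = ["the", "answer", "is", "new", "york", "city", "."]
labels = ["O", "O", "O", "B-ans", "I-ans", "E-ans", "O"]

label_list = ["O", "I-ans", "B-ans", "E-ans"]  # same order as in the code
label_map = {label: i for i, label in enumerate(label_list)}
label_ids = [label_map[l] for l in labels]
print(label_ids)  # [0, 0, 0, 2, 1, 3, 0]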
@@ -314,16 +321,18 @@ def create_examples(self, data):
         examples = []
         for sample in tqdm(data, total=len(data)):
             q_id = sample["id"]
-            image = sample["id"] # No Use
+            image = sample["id"]  # No Use
             question = sample["question"]
             doc_tokens = sample["document"]
             doc_boxes = sample["document_bbox"]
             answer = sample['answer']
             # only for the first label
             labels = sample['labels'][:480]
 
-            x_min, y_min = min(doc_boxes, key=lambda x: x[0])[0], min(doc_boxes, key=lambda x: x[2])[2]
-            x_max, y_max = max(doc_boxes, key=lambda x: x[1])[1], max(doc_boxes, key=lambda x: x[3])[3]
+            x_min, y_min = min(doc_boxes, key=lambda x: x[0])[0], min(
+                doc_boxes, key=lambda x: x[2])[2]
+            x_max, y_max = max(doc_boxes, key=lambda x: x[1])[1], max(
+                doc_boxes, key=lambda x: x[3])[3]
             width = x_max - x_min
             height = y_max - y_min
 
@@ -334,42 +343,37 @@ def create_examples(self, data):
             scale_x = 1000 / max(width, height)
             scale_y = 1000 / max(width, height)
 
-            scaled_doc_boxes = [ [
-                round((b[0] - x_min) * scale_x),
-                round((b[2] - y_min) * scale_y),
-                round((b[1] - x_min) * scale_x),
+            scaled_doc_boxes = [[
+                round((b[0] - x_min) * scale_x), round(
+                    (b[2] - y_min) * scale_y), round((b[1] - x_min) * scale_x),
                 round((b[3] - y_min) * scale_y)
-                ] for b in doc_boxes]
+            ] for b in doc_boxes]
 
-
             for box, oribox in zip(scaled_doc_boxes, doc_boxes):
                 if box[0] < 0:
-                    print (box, oribox)
-                if box[2] - box[0] <0:
-                    print (box, oribox)
-                if box[3] - box[1] <0:
-                    print (box, oribox)
+                    print(box, oribox)
+                if box[2] - box[0] < 0:
+                    print(box, oribox)
+                if box[3] - box[1] < 0:
+                    print(box, oribox)
                 for pos in box:
                     if pos > 1000:
-                        print (width, height, box, oribox)
+                        print(width, height, box, oribox)
 
             example = DocVQAExample(
-                q_id=q_id,
-                image=image,
-                question=question,
-                doc_tokens=doc_tokens,
-                doc_boxes=scaled_doc_boxes,
-                answer=answer,
-                labels=labels
-            )
+                q_id=q_id,
+                image=image,
+                question=question,
+                doc_tokens=doc_tokens,
+                doc_boxes=scaled_doc_boxes,
+                answer=answer,
+                labels=labels)
             examples.append(example)
         return examples
 
     def get_label_maps_docvqa(self):
 
-        labels = [
-            "O", "I-ans", "B-ans", "E-ans"
-        ]
+        labels = ["O", "I-ans", "B-ans", "E-ans"]
         label2id_map = {label: idx for idx, label in enumerate(labels)}
         return label2id_map
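The normalization above shifts all boxes to the page origin and scales the longer page side to the 0-1000 coordinate range LayoutLM-style models expect. The index pattern (b[0], b[2], b[1], b[3]) suggests the raw OCR boxes arrive as [x0, x1, y0, y1]; that reading is an inference from the code, not documented. A toy check of the arithmetic:

# Toy check of the box normalization, assuming raw boxes are [x0, x1, y0, y1].
doc_boxes = [[10, 110, 20, 60], [120, 300, 20, 60]]  # two words on one line
x_min = min(doc_boxes, key=lambda x: x[0])[0]  # 10
y_min = min(doc_boxes, key=lambda x: x[2])[2]  # 20
x_max = max(doc_boxes, key=lambda x: x[1])[1]  # 300
y_max = max(doc_boxes, key=lambda x: x[3])[3]  # 60
width, height = x_max - x_min, y_max - y_min   # 290, 40
scale = 1000 / max(width, height)              # longer side maps to 1000

scaled = [[
    round((b[0] - x_min) * scale),  # x0
    round((b[2] - y_min) * scale),  # y0
    round((b[1] - x_min) * scale),  # x1
    round((b[3] - y_min) * scale),  # y1
] for b in doc_boxes]
print(scaled)  # [[0, 0, 345, 138], [379, 0, 1000, 138]]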

@@ -384,7 +388,7 @@ def docvqa_input(self):
         for index, line in enumerate(f):
             data.append(json.loads(line.strip()))
 
-        # read the examples from train/test xlm files 
+        # read the examples from train/test xlm files
         examples = self.create_examples(data)
 
         # should be configured
@@ -393,26 +397,37 @@ def docvqa_input(self):
         max_doc_length = 512
         max_span_num = 1
         features = self.convert_examples_to_features(
-            examples, self.tokenizer,
+            examples,
+            self.tokenizer,
             max_seq_length=max_length,
             max_doc_length=max_doc_length,
             max_span_num=max_span_num,
             max_query_length=max_query_length,
             label_list=["O", "I-ans", "B-ans", "E-ans"],
             output_mode=None)
 
-        all_input_ids = paddle.to_tensor([f.input_ids for f in features], dtype="int64")
-        all_input_mask = paddle.to_tensor([f.input_mask for f in features], dtype="int64")
-        all_segment_ids = paddle.to_tensor([f.segment_ids for f in features], dtype="int64")
-        all_bboxes = paddle.to_tensor([f.boxes for f in features], dtype="int64")
-        all_p_index = paddle.to_tensor([f.p_index for f in features], dtype="int64")
-        all_labels = paddle.to_tensor([f.label for f in features], dtype="int64")
-        self.sample_list = [np.array(all_input_ids), np.array(all_input_mask), np.array(all_segment_ids), np.array(all_bboxes), np.array(all_labels)]
+        all_input_ids = paddle.to_tensor(
+            [f.input_ids for f in features], dtype="int64")
+        all_input_mask = paddle.to_tensor(
+            [f.input_mask for f in features], dtype="int64")
+        all_segment_ids = paddle.to_tensor(
+            [f.segment_ids for f in features], dtype="int64")
+        all_bboxes = paddle.to_tensor(
+            [f.boxes for f in features], dtype="int64")
+        all_p_index = paddle.to_tensor(
+            [f.p_index for f in features], dtype="int64")
+        all_labels = paddle.to_tensor(
+            [f.label for f in features], dtype="int64")
+        self.sample_list = [
+            np.array(all_input_ids), np.array(all_input_mask),
+            np.array(all_segment_ids), np.array(all_bboxes),
+            np.array(all_labels)
+        ]
 
     def __getitem__(self, idx):
-        return self.sample_list[0][idx], self.sample_list[1][idx], self.sample_list[2][idx], self.sample_list[3][idx], self.sample_list[4][idx]
+        return self.sample_list[0][idx], self.sample_list[1][
+            idx], self.sample_list[2][idx], self.sample_list[3][
+                idx], self.sample_list[4][idx]
 
     def __len__(self, ):
         return self.sample_list[0].shape[0]
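After docvqa_input runs, __getitem__ hands back five parallel arrays per sample (input ids, attention mask, segment ids, boxes, labels), so the dataset drops straight into paddle.io.DataLoader. A runnable stand-in that mirrors that 5-tuple contract; the shapes are assumptions based on max_span_num = 1 and the 512-token limit above:

import numpy as np
from paddle.io import DataLoader, Dataset


class ToyDocVQA(Dataset):
    """Toy stand-in mirroring DocVQA's 5-tuple __getitem__ contract."""

    def __init__(self, n=8, max_span_num=1, seq_len=512):
        self.input_ids = np.zeros((n, max_span_num, seq_len), dtype="int64")
        self.input_mask = np.ones((n, max_span_num, seq_len), dtype="int64")
        self.segment_ids = np.zeros((n, max_span_num, seq_len), dtype="int64")
        self.bboxes = np.zeros((n, max_span_num, seq_len, 4), dtype="int64")
        self.labels = np.zeros((n, seq_len), dtype="int64")

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.input_mask[idx],
                self.segment_ids[idx], self.bboxes[idx], self.labels[idx])

    def __len__(self):
        return self.input_ids.shape[0]


loader = DataLoader(ToyDocVQA(), batch_size=4)
for input_ids, input_mask, segment_ids, bboxes, labels in loader:
    print(input_ids.shape)  # [4, 1, 512]
    break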

