initial commit

PaddlePaddle · tianxin1860 · May 31, 2022 · May 18, 2022 · May 18, 2022 · May 18, 2022
commit 026053091f0e4b4fb67131a08196b7e573193bea
diff --git a/applications/doc_vqa/.gitignore b/applications/doc_vqa/.gitignore
@@ -0,0 +1,14 @@
+checkpoints/*
+__pycache__/*
+OCR_process/demo_pics/*
+Rerank/log/*
+Rerank/checkpoints/*
+Rerank/data/*
+Rerank/output/*
+Rerank/__pycache__/*
+Extraction/log/*
+Extraction/checkpoints/*
+Extraction/data/*
+Extraction/output/*
+Extraction/__pycache__/*
+
diff --git a/applications/doc_vqa/Extraction/README b/applications/doc_vqa/Extraction/README
@@ -0,0 +1,34 @@
+# 环境要求
+paddle 2.2.0+，动态图训练和预测
+python 3.7+
+
+# 已有模型测试
+bash run_test.sh saved_checkpoints/checkpoint-48000/
+得到结果：F1=65.04
+
+我在paddlenlp/transformers/layoutxlm/modeling.py中实现了多种训练方式，具体参考下面的说明。
+
+# 模型训练（单卡、crf算子：https://github.com/PaddlePaddle/models/blob/develop/dygraph/lac/sequence_labeling.py#L126）
+bash run_train.sh
+模型训练跑到第20个epoch预计效果较好。可以参考日志log/log_acl.txt，保存模型为checkpoint-48000。利用run_test.sh脚本，测试得到F1>=65代表基线模型训练可复现。
+
+# 模型训练（单卡、paddlenlp实现CRF，https://github.com/PaddlePaddle/models/blob/develop/dygraph/lac/sequence_labeling.py#L126）
+参见paddlenlp/transformers/layoutxlm/modeling.py中1048-1074行实现，并且注释掉代码前面的标准CRF算子实现即可。
+
+bash run_train.sh运行即可
+
+# 模型训练（多卡，paddlenlp实现CRF）
+
+bash run_train_multi.sh运行即可
+
+# 存在问题以及期望实现目标
+存在问题：
+1.使用CRF算子实现的版本无法在多卡上运行。
+2.使用paddlenlp实现的CRF，单卡、多卡都可运行，但是速度比CRF算子慢很多（预计慢3-5倍），并且效果尚未进行最终验证。
+
+期望实现目标：
+1.CRF能够实现多卡训练、并且效果打平甚至超过F1=65。
+
+可以两种方式：
+1.已有CRF算子优化，支持多卡，并且效果超过F1=65。
+2.现有paddlenlp的CRF实现优化，加速运行，速度和最终效果至少打平CRF算子。
diff --git a/applications/doc_vqa/Extraction/answer.png b/applications/doc_vqa/Extraction/answer.png
diff --git a/applications/doc_vqa/Extraction/change_to_mrc.py b/applications/doc_vqa/Extraction/change_to_mrc.py
@@ -0,0 +1,33 @@
+import sys
+import json
+import numpy as np
+
+def get_top1_from_ranker(path):
+    """Return the zero-based line index of the highest score in a score file.
+
+    The file at ``path`` is read as UTF-8 text; every line is stripped of
+    surrounding whitespace and converted with ``float``.
+
+    Args:
+        path: Path of the ranker score file, one score per line.
+
+    Returns:
+        The index produced by ``np.argmax`` over the parsed scores (the first
+        maximal entry when several scores tie).
+
+    Raises:
+        ValueError: If a line cannot be parsed as a float, or if the file
+            contains no lines at all.
+    """
+    ranker_scores = []
+    with open(path, "r", encoding="utf-8") as score_file:
+        for raw_line in score_file:
+            ranker_scores.append(float(raw_line.strip()))
+
+    best_index = np.argmax(ranker_scores)
+    return best_index
+
+def get_ocr_result_by_id(path, top_id):
+    """Load the JSON record stored on line ``top_id`` of a JSON-lines file.
+
+    Args:
+        path: Path of a UTF-8 text file holding one JSON document per line.
+        top_id: Zero-based index of the line to decode.
+
+    Returns:
+        The object obtained by decoding the selected line with ``json.loads``.
+
+    Raises:
+        IndexError: If ``top_id`` is outside the range of lines in the file.
+    """
+    with open(path, "r", encoding="utf-8") as ocr_file:
+        ocr_lines = ocr_file.readlines()
+        selected_line = ocr_lines[top_id]
+
+    record = json.loads(selected_line)
+    return record
+
+def write_to_file(doc, path):
+    """Write ``doc`` to ``path`` as one line of UTF-8 encoded JSON.
+
+    Any existing file at ``path`` is overwritten. Non-ASCII characters are
+    written as-is (``ensure_ascii=False``) and the JSON text is followed by a
+    single newline.
+
+    Args:
+        doc: JSON-serializable object to store.
+        path: Destination file path. Missing parent directories are created.
+    """
+    # Local import so this function does not depend on the module's import
+    # block being changed.
+    import os
+
+    # The output directory may not exist yet (for example on a fresh
+    # checkout), in which case open() would fail with FileNotFoundError.
+    parent_dir = os.path.dirname(path)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(doc, f, ensure_ascii=False)
+        f.write("\n")
+
+if __name__ == "__main__":
+    # Command-line entry point; the question text is the first argument.
+    # Exit with a usage message instead of an IndexError traceback when the
+    # argument is missing.
+    if len(sys.argv) < 2:
+        sys.exit(f"Usage: python {sys.argv[0]} <question>")
+    question = sys.argv[1]
+
+    # Input and output locations, relative to the current working directory.
+    ranker_result_path = "../Rerank/data/demo.score"
+    ocr_result_path = "../OCR_process/demo_ocr_res.json"
+    save_path = "data/demo_test.json"
+
+    # Pick the OCR record on the same line index as the best ranker score.
+    top_id = get_top1_from_ranker(ranker_result_path)
+    doc = get_ocr_result_by_id(ocr_result_path, top_id)
+    doc["question"] = question
+    # img_id is the one-based position of the selected record, as a string.
+    doc["img_id"] = str(top_id + 1)
+
+    write_to_file(doc, save_path)