From 6e62c121d2c2aa647e3ad112cf4395a2d4927566 Mon Sep 17 00:00:00 2001 From: KaparaNewbie Date: Wed, 20 Jan 2021 14:23:09 +0200 Subject: [PATCH] The need for (convenience of) a function such as intersect_dfs is demonstrated in this bioinformatics.stackexchange thread: https://bioinformatics.stackexchange.com/questions/9015/how-to-do-bedtools-intersection-using-pandas-alone/15181#15181 Often your workflow is pandas-based, but you want to intersect two dfs. Here intersect_dfs becomes useful as it takes two dfs as input and returns the intersection as another df. Given below are three tests for this function. # preparations import pandas as pd df1 = pd.DataFrame({0: ['chr1', 'chr1', 'chr1', 'chr1', 'chr2'], 1: [1, 100, 150, 900, 1], 2: [100, 200, 500, 950, 100], 3: ['feature1', 'feature2', 'feature3', 'feature4', 'feature4'], 4: [0, 0, 0, 0, 0], 5: ['+', '+', '-', '+', '+'], 6: ["remember", "the", "5th", "of", "november"]}) df2 = pd.DataFrame({0: ['chr1', 'chr1'], 1: [155, 800], 2: [200, 901], 4: [0, 0], 5: ['-', '+']}) intersect_kwargs = {"s": True} read_table_names = ["chrom", "start", "end", "name", "score", "strand", "whatever"] other_read_table_kwargs = {"usecols": ["chrom", "start", "end"]} # test 1 intersected_df = intersect_dfs(df1, df2, intersect_kwargs=intersect_kwargs, other_read_table_kwargs=other_read_table_kwargs, read_table_names=read_table_names) expected_df = pd.DataFrame({"chrom": ["chr1", "chr1"], "start": [155, 900], "end": [200, 901]}) assert intersected_df.equals(expected_df) # test 2 intersected_df = intersect_dfs(df1, df2, other_read_table_kwargs=other_read_table_kwargs) expected_df = pd.DataFrame({"chrom": ["chr1", "chr1", "chr1"], "start": [155, 155, 900], "end": [200, 200, 901]}) assert intersected_df.equals(expected_df) # test 3 intersected_df = intersect_dfs(df1, df2, read_table_names=read_table_names) expected_df = pd.DataFrame({"chrom": ["chr1", "chr1", "chr1"], "start": [155, 155, 900], "end": [200, 200, 901], "name": 
import pybedtools


def intersect_dfs(df1, df2, intersect_kwargs=None, read_table_names=None, other_read_table_kwargs=None):
    """
    Intersect two interval tables held as pandas DataFrames and return the result as a DataFrame.

    Each frame is converted with `pybedtools.BedTool.from_dataframe`, the two BedTools are intersected with
    `BedTool.intersect` (`df1` on the calling side, `df2` as the argument), and the outcome is converted back
    with `BedTool.to_dataframe`.

    @param df1: file A
    @type df1: pandas.DataFrame
    @param df2: file B
    @type df2: pandas.DataFrame
    @param intersect_kwargs: keyword arguments forwarded to `BedTool.intersect`; `None` means no extra arguments
    @type intersect_kwargs: dict
    @param read_table_names: column names for the resulting DataFrame; when given, `header=None` and
                             `names=read_table_names` are forwarded to `BedTool.to_dataframe`
    @type read_table_names: list[str]
    @param other_read_table_kwargs: additional keyword arguments forwarded to `BedTool.to_dataframe`; they are
                                    merged last, so a `header` or `names` key in here replaces the value
                                    derived from `read_table_names`
    @type other_read_table_kwargs: dict
    @return: intersected_df
    @rtype: pandas.DataFrame
    """
    bed_a = pybedtools.BedTool.from_dataframe(df1)
    bed_b = pybedtools.BedTool.from_dataframe(df2)

    if intersect_kwargs is None:
        intersect_kwargs = {}
    overlap = bed_a.intersect(bed_b, **intersect_kwargs)

    # Options for the BedTool -> DataFrame conversion; explicit names also imply "no header row".
    if read_table_names is None:
        to_df_options = {}
    else:
        to_df_options = {"header": None, "names": read_table_names}
    if other_read_table_kwargs is not None:
        to_df_options.update(other_read_table_kwargs)

    return overlap.to_dataframe(**to_df_options)