{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pathak: Example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

🐍Load Python libraries

\n", "
" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "import pickle\n", "import os.path\n", "\n", "import requests\n", "import pickle\n", "import re\n", "import json\n", "import typing\n", "\n", "\n", "import networkx as nx\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "import nlu\n", "import nltk\n", "# nltk.download('punkt')\n", "from nltk.tokenize import word_tokenize \n", "from nltk import FreqDist, sent_tokenize, word_tokenize # $ pip install nltk\n", "\n", "from bs4 import BeautifulSoup\n", "\n", "from IPython.display import display\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import plotly.graph_objects as go\n", "import plotly.io as pio\n", "# pio.renderers\n", "pio.renderers.default = \"svg\"\n", "\n", "from itertools import chain\n", "from tqdm.notebook import tqdm, trange\n", "import warnings\n", "import session_info\n", "import time\n", "start = time.time()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Environment setup: Make sure java is loaded

\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Load Java 8 and Set Java home directory" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "java version \"1.8.0_202\"\n", "Java(TM) SE Runtime Environment (build 1.8.0_202-b08)\n", "Java HotSpot(TM) 64-Bit Server VM (build 25.202-b08, mixed mode)\n" ] } ], "source": [ "# Load Java 8 required by nlu and Pyspark\n", "# !module load Java/1.8.0_202\n", "!java -version" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "# Set Java home directory cheaha\n", "os.environ[\"JAVA_HOME\"] = os.environ[\"JAVA_HOME\"]\n", "os.environ[\"PATH\"] = os.environ[\"JAVA_HOME\"] + \"/bin:\" + os.environ[\"PATH\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "print(os.environ[\"JAVA_HOME\"])\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Set spark environment varibales

\n", "
" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "os.environ['PYSPARK_PYTHON'] = '/home/nileshkr/.conda/envs/sparknlp/bin/python3.8'\n", "os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/nileshkr/.conda/envs/sparknlp/bin/python3.8'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Load pretrained model and supporting file

\n", "
" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
documentsentence_embedding_biobert
0The maintenance of the structure and integrity...[-0.1828726977109909, 0.25272125005722046, -0....
1The production of new individuals that contain...[-0.031625792384147644, 0.027806805446743965, ...
2The repair of single strand breaks in DNA.[-0.05223826691508293, -0.028281446546316147, ...
3Repair of such breaks is mediated by the same ...[0.02170223370194435, -0.04093412682414055, -0...
4Catalysis of the hydrolysis of ester linkages ...[-0.00821786466985941, -0.23827922344207764, -...
.........
5614Any process that activates or increases the fr...[0.053574178367853165, -0.080331951379776, -0....
5615Any process that activates or increases the fr...[0.015425608493387699, -0.013072511181235313, ...
5616The chemical reactions and pathways involving ...[0.0855507105588913, -0.05061568692326546, -0....
5617The chemical reactions and pathways resulting ...[0.11358797550201416, -0.13088946044445038, -0...
5618The chemical reactions and pathways resulting ...[0.11483287811279297, -0.15022939443588257, -0...
\n", "

5619 rows × 2 columns

\n", "
" ], "text/plain": [ " document \\\n", "0 The maintenance of the structure and integrity... \n", "1 The production of new individuals that contain... \n", "2 The repair of single strand breaks in DNA. \n", "3 Repair of such breaks is mediated by the same ... \n", "4 Catalysis of the hydrolysis of ester linkages ... \n", "... ... \n", "5614 Any process that activates or increases the fr... \n", "5615 Any process that activates or increases the fr... \n", "5616 The chemical reactions and pathways involving ... \n", "5617 The chemical reactions and pathways resulting ... \n", "5618 The chemical reactions and pathways resulting ... \n", "\n", " sentence_embedding_biobert \n", "0 [-0.1828726977109909, 0.25272125005722046, -0.... \n", "1 [-0.031625792384147644, 0.027806805446743965, ... \n", "2 [-0.05223826691508293, -0.028281446546316147, ... \n", "3 [0.02170223370194435, -0.04093412682414055, -0... \n", "4 [-0.00821786466985941, -0.23827922344207764, -... \n", "... ... \n", "5614 [0.053574178367853165, -0.080331951379776, -0.... \n", "5615 [0.015425608493387699, -0.013072511181235313, ... \n", "5616 [0.0855507105588913, -0.05061568692326546, -0.... \n", "5617 [0.11358797550201416, -0.13088946044445038, -0... \n", "5618 [0.11483287811279297, -0.15022939443588257, -0... \n", "\n", "[5619 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 62 ms, sys: 39.1 ms, total: 101 ms\n", "Wall time: 155 ms\n" ] } ], "source": [ "%%time\n", "\n", "df = pd.read_csv('GO_BP_Ontology_corups_Filtered.tsv', sep=\"\\t\")\n", "\n", "with open('predictions_full_BP.pickle', 'rb') as handle:\n", " predictions = pickle.load(handle)\n", "display(predictions)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 15.9 ms, sys: 5.59 ms, total: 21.5 ms\n", "Wall time: 2.9 s\n" ] } ], "source": [ "%%time\n", "## Calculate dinstance between all pairs of sentences in DF \n", "e_col = 'sentence_embedding_biobert'\n", "stored_model_path = \"pipe_full_BP\"\n", "pipe = nlu.load(path=stored_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Function: Calculation of cosine similarity score.\n", "

" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GONameDefinitionDepthsim_score
0GO:0000002mitochondrial genome maintenance.The maintenance of the structure and integrity...60.658170
1GO:0000003reproduction.The production of new individuals that contain...10.679071
2GO:0000012single strand break repair.The repair of single strand breaks in DNA.60.665612
3GO:0000012single strand break repair.Repair of such breaks is mediated by the same ...60.643082
4GO:0000014single-stranded DNA endodeoxyribonuclease acti...Catalysis of the hydrolysis of ester linkages ...80.631392
..................
5614GO:2001235positive regulation of apoptotic signaling pat...Any process that activates or increases the fr...40.700227
5615GO:2001280positive regulation of unsaturated fatty acid ...Any process that activates or increases the fr...60.688193
5616GO:2001289lipid X metabolic process.The chemical reactions and pathways involving ...40.683334
5617GO:2001294malonyl-CoA catabolic process.The chemical reactions and pathways resulting ...50.644774
5618GO:2001295malonyl-CoA biosynthetic process.The chemical reactions and pathways resulting ...60.640183
\n", "

5619 rows × 5 columns

\n", "
" ], "text/plain": [ " GO Name \\\n", "0 GO:0000002 mitochondrial genome maintenance. \n", "1 GO:0000003 reproduction. \n", "2 GO:0000012 single strand break repair. \n", "3 GO:0000012 single strand break repair. \n", "4 GO:0000014 single-stranded DNA endodeoxyribonuclease acti... \n", "... ... ... \n", "5614 GO:2001235 positive regulation of apoptotic signaling pat... \n", "5615 GO:2001280 positive regulation of unsaturated fatty acid ... \n", "5616 GO:2001289 lipid X metabolic process. \n", "5617 GO:2001294 malonyl-CoA catabolic process. \n", "5618 GO:2001295 malonyl-CoA biosynthetic process. \n", "\n", " Definition Depth sim_score \n", "0 The maintenance of the structure and integrity... 6 0.658170 \n", "1 The production of new individuals that contain... 1 0.679071 \n", "2 The repair of single strand breaks in DNA. 6 0.665612 \n", "3 Repair of such breaks is mediated by the same ... 6 0.643082 \n", "4 Catalysis of the hydrolysis of ester linkages ... 8 0.631392 \n", "... ... ... ... \n", "5614 Any process that activates or increases the fr... 4 0.700227 \n", "5615 Any process that activates or increases the fr... 6 0.688193 \n", "5616 The chemical reactions and pathways involving ... 4 0.683334 \n", "5617 The chemical reactions and pathways resulting ... 5 0.644774 \n", "5618 The chemical reactions and pathways resulting ... 6 0.640183 \n", "\n", "[5619 rows x 5 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_sim_df_for_string(predictions, e_col, string_to_embed, pipe=pipe):\n", "\n", " \"\"\"\n", " Creates a Dataframe which has a `sim_score` column which describes the similarity with the `string_to_embed` variable.\n", "\n", " Args:\n", " predictions: A DataFrame of predictions.\n", " e_col: The name of the column in `predictions` that contains the embeddings.\n", " string_to_embed: The string to embed.\n", " pipe: A spaCy pipeline object.\n", "\n", " Returns:\n", " A DataFrame with an additional `sim_score` column.\n", " \"\"\"\n", "\n", " # Put predictions vectors in matrix.\n", " embed_mat = np.array([x for x in predictions[e_col]])\n", "\n", " # Embed string input string.\n", " embedding = pipe.predict(string_to_embed).iloc[0]['sentence_embedding_from_disk']\n", "\n", " # Replicate embedding for input string.\n", " m = np.array([embedding,] * len(df))\n", "\n", " # Compute cosine similarity.\n", " sim_mat = cosine_similarity(m, embed_mat)\n", "\n", " # Write sim score.\n", " df['sim_score'] = sim_mat[0]\n", "\n", " # Return df.\n", " return df\n", "\n", "get_sim_df_for_string(predictions, e_col, \"Test test\" )" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "
\n", "

Function: Loading document, xml or PubMed API.

\n", "
" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 232 µs, sys: 155 µs, total: 387 µs\n", "Wall time: 48.4 µs\n" ] } ], "source": [ "%%time\n", "\n", "def query_go(query: str, Top: int = 3, sim_cutoff: float = 0.92, Send: str = \"ALL\") -> typing.Union[list, bool]:\n", "\n", " \"\"\"\n", " Queries the Gene Ontology (GO) database for terms that are similar to the given query.\n", "\n", " Args:\n", " query: The query string.\n", " Top: The number of top results to return.\n", " sim_cutoff: The similarity cutoff.\n", " Send: The type of results to send.\n", "\n", " Returns:\n", " A list of GO terms that are similar to the query, or False if no results were found.\n", " \"\"\"\n", "\n", " # Get the similarity matrix between the query and the GO terms.\n", " sim_df = get_sim_df_for_string(predictions, e_col, query)\n", "\n", " # Sort the similarity matrix by similarity score.\n", " df_res = sim_df.sort_values('sim_score', ascending=False).iloc[:Top]\n", "\n", " # Filter the results by similarity score.\n", " df_res = df_res.loc[df_res['sim_score'] > sim_cutoff]\n", "\n", " # If no results were found, return False.\n", " if not len(df_res):\n", " return False\n", "\n", " # Return the results.\n", " if Send == \"ALL\":\n", " return [tuple(i) for i in (df_res[[\"GO\", \"Name\", \"Definition\", \"Depth\", \"sim_score\"]].values.tolist())]\n", " elif Send == \"GO\":\n", " return [tuple(i) for i in (df_res[[\"GO\", \"sim_score\"]].values.tolist())]\n", " elif Send == \"Name\":\n", " return [tuple(i) for i in (df_res[[\"Name\", \"sim_score\"]].values.tolist())]\n", " elif Send == \"Def\":\n", " return [tuple(i) for i in (df_res[[\"Definition\", \"sim_score\"]].values.tolist())]\n", " elif Send == \"Depth\":\n", " return [tuple(i) for i in (df_res[[\"Depth\", \"sim_score\"]].values.tolist())]\n", " else:\n", " raise ValueError(\"Invalid value for `Send`.\")\n", " \n", "# query_go('Gonadoblastoma is defined as an abnormal growth of germ cells and sex cord elements on the gonad.')\n", "# query_go(' regulate signal transduction events generated by G-protein coupled receptors', sim_cutoff=0.80)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Function: Lower case and uppercase lambda function

\n", " The given Python code defines two lambda functions: Lowerset and Upperset. Both functions take a set as input, convert all elements to lowercase or uppercase respectively, and return a new set containing the transformed elements.\n", "
\n" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# Define a lambda function LowerSet that transforms all elements in a set to lowercase\n", "Lowerset = lambda Set: set([i.lower() for i in Set])\n", "\n", "# Define a lambda function UpperSet that transforms all elements in a set to uppercase\n", "Upperset = lambda Set: set([i.upper() for i in Set])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Function: Loading Gene list from disk

\n", " The code reads a list of gene names from a file (\"1.2_ATH_Gene_all.lst\"), removes certain terms from the list, converts the list to a set, and then converts the set to a list of uppercase gene IDs.\n", "
" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total Genes in Human : 33651 \n", " ['ATPEX12', 'ATPMEI1', 'COR413-TM1', 'AT3G61440', 'GA3OX2']\n", "Total IDs in ATH : 33589 \n", " ['TAF6B1', 'HAP2C', 'AT3G10700', 'AG', 'ATARI11']\n" ] } ], "source": [ "# Read gene names from file and create a list of unique gene names\n", "Gene_list = list(set([i for i in open(\"Auxiliary_Files/1.2_ATH_Gene_all.lst\").read().splitlines() if len(i)]))\n", "\n", "# Print the number of genes and the first five genes in the list\n", "print(f\"Total Genes in Human : {len(Gene_list)} \\n {Gene_list[:5]}\")\n", "\n", "# Remove certain terms from the list of genes\n", "Gene_list = set(Gene_list) - set([\"AND\", \"AN\", \"CAN\", \"NOT\", \"HAS\", \"FACT\", \n", " \"MIN\", \"LATE\", \"MAIN\", \"RING\", \"BP\", \n", " \"POLAR\", \"CO\", \"NIH\", \"LAB\", \"GI\", \"NSF\", \n", " \"WAS\", \"MICE\"])\n", "\n", "# Convert the set of genes to a list, and then convert it to uppercase using the `Upperset` function\n", "Gene_list = Upperset(set(list(Gene_list)))\n", "\n", "# Print the number of gene IDs and the last five gene IDs in the list\n", "print(f\"Total IDs in ATH : {len(Gene_list)} \\n {list(Gene_list)[-5:]}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Function: To check if paragraph contains any of the Arabidopsis thaliana genes

\n", " The code reads a list of gene names from a file (\"1.2_ATH_Gene_all.lst\"), removes certain terms from the list, converts the list to a set, and then converts the set to a list of uppercase gene IDs.\n", "
" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 95 µs, sys: 63 µs, total: 158 µs\n", "Wall time: 24.8 µs\n" ] } ], "source": [ "%%time\n", "\n", "def para_pass(text: str, Print_list: bool = False):\n", " \"\"\"\n", " Takes a text paragraph and returns a boolean value indicating whether or not any genes were found in the text.\n", "\n", " Args:\n", " text: The text paragraph to analyze.\n", " Print_list: If True, the list of genes found in the text will be printed.\n", "\n", " Returns:\n", " A boolean value indicating whether or not any genes were found in the text.\n", " \"\"\"\n", "\n", " # Tokenize the text into words.\n", " words = chain.from_iterable(map(word_tokenize, sent_tokenize(text)))\n", "\n", " # Create a frequency distribution of the words.\n", " freq = FreqDist(map(str.casefold, words))\n", "\n", " # Convert the frequency distribution to a set of words.\n", " word_bag = Upperset(freq)\n", "\n", " # Intersect the set of words with the set of genes.\n", " intersection = word_bag.intersection(Gene_list)\n", "\n", " # If the intersection is not empty, return True.\n", " if intersection:\n", " if Print_list:\n", " print(intersection)\n", " return intersection\n", "\n", " # Otherwise, return False.\n", " return False" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Function: To plot summary table

\n", " This is a Python function named Plot_data that plots the data in the given Data dictionary. It takes in one argument, a dictionary called Data, and returns a networkx graph of the data. \n", "
" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "def Plot_data(Data) -> nx.Graph:\n", " \"\"\"\n", " Plots the data in the given `Data` dictionary.\n", "\n", " Args:\n", " Data: A dictionary containing the following keys:\n", " * `Meta_info`: A dictionary containing the metadata for the paper.\n", " * `Search_info`: A list of dictionaries, each of which contains the results of a search for a gene.\n", "\n", " Returns:\n", " A networkx graph of the data.\n", " \"\"\"\n", "\n", " # Get the metadata for the paper.\n", " Meta = Data[0][\"Meta_info\"]\n", " #print(Meta)\n", " values = [['Paper Title', 'Species', 'pmid', 'PMC', 'DOI'],\n", " ['' + Meta['Title'] + \"\", Meta['Species'], Meta['pmid'], Meta['pmc'], Meta['DOI']]]\n", "\n", " fig = go.Figure(data=[go.Table(\n", " columnorder = [1,2],\n", " columnwidth = [100,400],\n", " header = dict(\n", " values = [['Article information'],\n", " ['Description']],\n", " line_color='darkslategray',\n", " fill_color='#144B39',\n", " align=['left','center'],\n", " font=dict(color='white', size=12),\n", " height=20\n", " ),\n", " cells=dict(\n", " values=values,\n", " line_color='darkslategray',\n", " fill=dict(color=['#FFD400', 'white']),\n", " align=['left', 'left'],\n", " font_size=12,\n", " height=20)\n", " )\n", " ])\n", "\n", " fig.update_layout(width=800, height=175)\n", " fig.update_layout(margin = dict(t=0, l=5, r=0, b=0))\n", " fig.show()\n", " ########################\n", "\n", " ########################\n", " Dic_list = []\n", "\n", " for i in range(1, len(Data)-1):\n", " #print(i)\n", " Res = Data[i][\"Search_info\"]\n", "\n", " if len(Res):\n", " for s in Res:\n", " for g in Data[i][\"Genes\"].split(\";\"):\n", " Dic = {\"Genes\":g, \"GO\":s[\"GO\"], \"Name\":s[\"GO_Name\"], \"Score\":s[\"Score\"], \"Depth\":s[\"Depth\"]}\n", " Dic_list.append(Dic)\n", "\n", " df = pd.DataFrame(Dic_list)\n", " df_uniq = df.drop_duplicates(keep=\"first\")\n", " if df_uniq.empty:\n", " print('No Human genes found or GO mapped!!\\n')\n", " return\n", "\n", "\n", " ######################\n", " # Plot Bipartite Graph\n", " ######################\n", " G = nx.from_pandas_edgelist(df_uniq, \"Genes\", \"GO\", ['Name', 'Score'])\n", " G.name = \"PhenoMiner\"\n", " color_map = []\n", " for node in G:\n", " if node.startswith(\"GO:\"):\n", " color_map.append('#80BC00')\n", " else: \n", " color_map.append('#FFD400')\n", "\n", "\n", " try:\n", " top = nx.bipartite.sets(G)[0]\n", " except:\n", " print(\"Warning: AmbiguousSolution\")\n", " return G\n", " \n", " pos = nx.bipartite_layout(G, top)\n", "\n", " nx.draw(G, pos=pos, node_color=color_map, with_labels=True, \n", " verticalalignment='bottom', node_size=100, font_size = 12)\n", "\n", " ######################\n", "\n", "\n", " fig = go.Figure(data=[go.Table(\n", " header=dict(values=list(df_uniq.columns),\n", " fill_color='#144B39', \n", " font=dict(color='white', size=12),\n", " line_color='#FFD400',\n", " align='left'),\n", " cells=dict(values=[df_uniq.Genes, df_uniq.GO, df_uniq.Name, df_uniq.Score, df_uniq.Depth],\n", " fill_color='white', \n", " font=dict(color='#144B39', size=12), \n", " line_color='#80BC00',\n", " align='left'))\n", " ])\n", "\n", " fig.update_layout(margin = dict(t=0, l=5, r=5, b=0))\n", " fig.show()\n", " \n", " return G" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Function: To split document into Paragraphs and lines.

\n", " The given function is a python function named read_pubmed. This function accepts five arguments named File, Method, sim_cutoff, Organism, and Print.\n", "\n", "File is a mandatory argument that specifies the name of the file to be read. Method is an optional argument that specifies the method to be used to read the file. If this argument is not specified, a warning message is displayed that mentions the options to read the file. sim_cutoff is another optional argument that sets the threshold score for the semantic similarity score, with a default value of 0.92. Organism is a list of organisms to search in the document, with a default value of ['Arabidopsis', 'thaliana', 'A. thaliana', 'thaliana', 'A. thaliana']. Finally, Print is an optional boolean argument that specifies whether to print the output or not, with a default value of True.\n", "\n", "The function first checks whether the Method argument is set or not. If not, it will display a warning message about the options for this argument and return False. Then it checks whether the whole body is accessible or not.\n", "\n", "The function then uses the Beautifulsoup module to split the document into paragraphs. If the Method is set to Offline, it reads the file using the open function and stores the data in the data variable. It then uses Beautifulsoup to parse the data and stores the result in the Bs_data variable.\n", "\n", "If the Method is set to PMC, it replaces the \"PMC\" string from the File argument with an empty string and constructs a URL to fetch the file from the PubMed Central database. It then uses the requests.get function to fetch the file from the URL and stores the result in the data variable. It then uses Beautifulsoup to parse the data and stores the result in the Bs_data variable. If the full text is not accessible, a warning message is displayed, and searching is limited to the abstract only.\n", "\n", "The function then extracts metadata such as the organism, title, PMCID, PMID, and DOI from the document using Beautifulsoup. It then processes each paragraph in the document and searches for gene-related information in it. It uses the query_go function to search for gene-related information and if found, it adds the gene-related information to a list of search information. It then creates a dictionary containing information about the paragraph, including its number, genes, and search information, and appends this dictionary to a list of dictionaries.\n", "\n", "Finally, the function checks if the list of dictionaries is not empty. If it is not empty and the Print argument is set to True, it prints the list of dictionaries in JSON format. Otherwise, it returns None.\n", "
" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 32 µs, sys: 0 ns, total: 32 µs\n", "Wall time: 35.5 µs\n" ] } ], "source": [ "%%time\n", "title_meta = ''\n", "\n", "\n", "def read_pubmed(File, Method = None, sim_cutoff = 0.92, Organism = ['Arabidopsis', 'thaliana', 'A. thaliana', 'thaliana', 'A. thaliana',], \n", " Print = True):\n", " \"\"\"\n", " Reads a PubMed file and returns a list of dictionaries, each of which contains the following keys:\n", " * `Meta_info`: A dictionary containing the metadata for the paper.\n", " * `Para_info`: A list of dictionaries, each of which contains the information for a paragraph.\n", "\n", " Args:\n", " File: The path to the PubMed file.\n", " Method: The method to use to read the file. Can be `\"Offline\"` to read an offline XML file, or `\"PMC\"` to read an online file using the given PMC ID.\n", " sim_cutoff: The similarity cutoff to use when searching for genes in the text.\n", " Organism: A list of strings containing the names of the organisms to search for.\n", " Print: A boolean value indicating whether or not to print status messages.\n", "\n", " Returns:\n", " A list of dictionaries, each of which contains the metadata and information for a paragraph in the PubMed file.\n", " \"\"\"\n", "\n", "\n", " \n", " #####################################################\n", " # Check if Whole body is accessible\n", " #####################################################\n", " fh = open(\"temp.del\", \"w\")\n", " if not Method:\n", " warnings.warn(\"\"\"Warning Message: \n", " Method not set\n", " Options:\n", " \\tOffline : Reads offline xml file.\n", " \\t\\t Example : read_pubmed('./manuscript/PMC004xxxxxx/PMC4000261.xml'\n", " \\tOnline : Reads online file using given PMC ID\n", " \\t\\t read_pubmed('4304705')\n", " \"\"\")\n", " return False\n", " \n", " \n", " #####################################################\n", " # Use Beautifulsoup for split doucment in para\n", " ##################################################### \n", " if Method == \"Offline\":\n", " with open(File, 'r') as f:\n", " data = f.read()\n", " Bs_data = BeautifulSoup(data, \"html.parser\")\n", " \n", " if Method == \"PMC\":\n", " if File.startswith(\"PMC\"):\n", " File = File.replace(\"PMC\", \"\")\n", " URL = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=\"+ str(File) +\"&tool=my_tool&email=my_email@example.com\"\n", " data = requests.get(URL)\n", " Bs_data = BeautifulSoup(data.text, \"html.parser\")\n", " if not Bs_data.body and Bs_data.abstract:\n", " warnings.warn(\"Full text is not accessible, searching is limited to 'Abstract' only!!!\\n Try other method type like URL or Offline XML.\")\n", " \n", " #####################################################\n", " # Mate info from the document\n", " ##################################################### \n", " # Organism\n", " Species = []\n", " for Sp in Organism:\n", " Sp_text = Bs_data(text=re.compile(Sp))\n", " if Sp_text:\n", " #print(Sp)\n", " Species.append(Sp)\n", " Species = \"; \".join(Species)\n", " \n", " if not len(Species):\n", " Species = \"**\"\n", " #return False\n", " if Print:\n", " print(f\"Species = {Species}\")\n", " \n", " \n", " \n", " # Metadata\n", " Title = ''\n", " pmid = \"\"\n", " pmc = \"\"\n", " doi = ''\n", " b_meta = Bs_data.find_all(\"article-id\")\n", " for m in b_meta:\n", " #print(m)\n", " if \"pmid\" in m.attrs.values():\n", " pmid = m.text\n", " #print(m.text)\n", "\n", " if \"pmc\" in m.attrs.values():\n", " pmc = m.text\n", " #print(m.text)\n", "\n", " if \"doi\" in m.attrs.values():\n", " doi = m.text\n", " #print(m.text)\n", "\n", "\n", " Title = str(Bs_data.find('title-group').contents[1]).replace(\"\", \"\").replace(\"\", \"\")\n", " #print(Title)\n", "\n", "\n", " b_unique = Bs_data.find_all('p')\n", "\n", " Unkown_para_num = 1\n", "\n", " List_dic = [{\"Meta_info\":{\"Title\":Title, \"Species\":Species, \"pmid\":pmid, \"pmc\":pmc, \"DOI\":doi}}]\n", "\n", " #####################################################\n", " # Process para \n", " ##################################################### \n", " for i in b_unique:\n", " #print(i.name, i.attrs,i.text, \"##########\")\n", " para_ID = \"\"\n", " #Text = i.text\n", " if 'id' in i.attrs:\n", " #print(i.attrs['id'], \"||||||||||||||||\")\n", " para_ID = i.attrs['id']\n", " else:\n", " #print(\"U\" + str(Unkown_para_num), \"##########\")\n", " para_ID = \"U\" + str(Unkown_para_num)\n", " Unkown_para_num += 1\n", " #####################################################\n", " # Split para into lines\n", " #####################################################\n", " if para_pass(i.text):\n", " dic = {\"Para_number\":para_ID, \"Genes\":\";\".join(para_pass(i.text)), \"Para\":i.text}\n", " #dic = {\"Species\":Species, \"pmid\":pmid, \"pmc\":pmc, \"Para_number\":para_ID, \"Genes\":\";\".join(para_pass(i.text)), \"Para\":i.text, \"DOI\":doi}\n", " \n", " #print(pmid, pmc, para_ID, \";\".join(para_pass(i.text)), i.text, doi, sep=\"\\t\")\n", " #print(\"|_________________|\", json.dumps(dic, indent=4, default=str), i.text, file=fh)\n", " Search_info = []\n", " for sent in sent_tokenize(i.text):\n", " query_res = query_go(sent, sim_cutoff = sim_cutoff)\n", " \n", " if query_res:\n", " #print(query_res)\n", " for q in query_res:\n", " query_dic = {\"Sentance\":sent,\n", " \"GO\":query_res[0][0], \n", " \"GO_Name\":query_res[0][1], \n", " \"Definition\":query_res[0][2], \n", " \"Depth\":query_res[0][3],\n", " \"Score\":query_res[0][4]\n", " }\n", " Search_info.append(query_dic)\n", " print(sent, dic[\"Genes\"], file=fh)\n", " print(query_dic, file=fh)\n", " print(\">\" * 10, file=fh)\n", " print(\"#\" * 10, file = fh)\n", " dic[\"Search_info\"] = Search_info\n", " List_dic.append(dic)\n", " \n", " \n", " if len(List_dic):\n", " if Print:\n", " print(json.dumps(List_dic, indent=4, default=str))\n", " #print(json.dumps(List_dic, indent=4, default=str), file=fh)\n", " return List_dic\n", " \n", " fh.close()\n", " return False\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Call function to predict GOs\n", "
\n", "

Example 1 (Bad): PMC1459476

\n", " Two Isoforms of a Divalent Metal Transporter (DMT1) in Schistosoma mansoni Suggest a Surface-associated Pathway for Iron Absorption in Schistosomes*. \n", "
" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "Paper TitleSpeciespmidPMCDOIArticle informationTwo Isoforms of a Divalent Metal Transporter (DMT1) in <italic>Schistosoma mansoni</italic> Suggest aSurface-associated Pathway for Iron Absorption in Schistosomes<xref ref-type=\"fn\" rid=\"FN1\">*</xref>Arabidopsis; thaliana; thaliana16267047145947610.1074/jbc.M511148200Description" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Warning: AmbiguousSolution\n", "CPU times: user 3min 7s, sys: 2min 6s, total: 5min 14s\n", "Wall time: 1min 2s\n" ] } ], "source": [ "%%time\n", "df_temp = read_pubmed('Example_data/PMC1459476.xml', \n", " sim_cutoff=0.90 ,Print=False, Method = \"Offline\")\n", "G = Plot_data(df_temp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Example 2 (Good): PMC1802096

\n", " MOR1/GEM1 plays an essential role in the plant-specific cytokinetic phragmoplast. \n", "
" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "Paper TitleSpeciespmidPMCDOIArticle informationMOR1/GEM1 plays an essential role in the plant-specific cytokinetic phragmoplastArabidopsis; thaliana; A. thaliana; thaliana; A. thaliana12198497180209610.1038/ncb844Description" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/svg+xml": [ "MOR1MOR1MOR1MOR1MOR1MOR1GEM1GEM1GEM1GEM1GEM1GEM1MOR1GEM1AGGEMGEM1GEM1GEM1GEM1GenesGO:0034314GO:0080175GO:0061640GO:0031110GO:0000913GO:0051598GO:0009555GO:0009561GO:0022619GO:0051598GO:0009561GO:0009554GO:0071586GO:0071586GO:0071586GO:0071586GO:0071586GO:0007129GO:0051026GO:0034314GOArp2/3complex-mediatedactin nucleation.phragmoplastmicrotubuleorganization.cytoskeleton-dependentcytokinesis.regulation ofmicrotubulepolymerization ordepolymerization.preprophase bandassembly.meioticrecombinationcheckpoint signaling.pollen development.megagametogenesis.generative celldifferentiation.meioticrecombinationcheckpoint signaling.megagametogenesis.megasporogenesis.CAAX-box proteinprocessing.CAAX-box proteinprocessing.CAAX-box proteinprocessing.CAAX-box proteinprocessing.CAAX-box proteinprocessing.homologouschromosome pairingat meiosis.chiasma assembly.Arp2/3complex-mediatedactin nucleation.Name0.94111992720436520.93730641987727310.95089070679097330.9403089019116130.93997032483267050.94120095526790740.96254905142802570.94512064888309790.95801694096267050.94871342684539960.93628613029146960.93722373590826360.93699294472360220.93699294472360220.93699294472360220.93660156972899490.93660156972899490.94492805140785330.93740373541239620.9392108852434374Score64453442342466666436Depth" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 52s, sys: 1min 16s, total: 3min 8s\n", "Wall time: 37.1 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%time\n", "#####################################################\n", "# Input method: Negative example\n", "#####################################################\n", "Out_def = read_pubmed('Example_data/PMC1802096.xml', \n", " Print=False, Method = \"Offline\", sim_cutoff=0.935)\n", "Plot_data(Out_def)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-----\n", "bs4 4.11.1\n", "matplotlib 3.6.2\n", "networkx 2.8.8\n", "nltk 3.8\n", "nlu 4.0.0\n", "numpy 1.23.5\n", "pandas 1.5.2\n", "plotly 5.3.1\n", "requests 2.28.1\n", "seaborn 0.12.2\n", "session_info 1.0.0\n", "sklearn 1.2.0\n", "tqdm 4.49.0\n", "-----\n", "IPython 8.7.0\n", "jupyter_client 7.4.8\n", "jupyter_core 5.1.1\n", "-----\n", "Python 3.8.15 | packaged by conda-forge | (default, Nov 22 2022, 08:49:35) [GCC 10.4.0]\n", "Linux-3.10.0-1160.24.1.el7.x86_64-x86_64-with-glibc2.10\n", "-----\n", "Session information updated at 2023-04-13 17:36\n" ] } ], "source": [ "session_info.show(html=False)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "sparknlp", "language": "python", "name": "sparknlp" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.15" } }, "nbformat": 4, "nbformat_minor": 4 }