Contents

Download expression data

Contents

Download expression data

import GEOparse
import pandas as pd
import numpy as np
import re
from pathlib import Path
import pylab as pl
import seaborn as sns
# Notebook-wide matplotlib defaults: figure size plus tick, axis-label and
# legend font sizes, applied in a single update.
pl.rcParams.update({
    'figure.figsize': (14, 10),
    'ytick.labelsize': 12,
    'xtick.labelsize': 11,
    'axes.labelsize': 23,
    'legend.fontsize': 20,
})
sns.set_style('ticks')
# Four colours taken from seaborn's "Set1" palette.
c1, c2, c3, c4 = sns.color_palette("Set1", 4)

# !pip install GEOparse

# Output folders, one per stage of the workflow. The trailing slash is kept
# because later cells build file paths by plain string concatenation.
Dir_Expression = "1_Expression_data/"
Dir_WGCNA = "2_WGCNA_data/"
Dir_GRN = "3_GRN_data/"

# Create every folder up front; existing folders are left untouched.
for _out_dir in (Dir_Expression, Dir_WGCNA, Dir_GRN):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)

Download data

URL = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE74nnn/GSE74488/suppl/GSE74488_sc_expression.csv.gz'
!curl {URL} -O GSE74488_sc_expression.csv.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  0 6608k    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

 52 6608k   52 3440k    0     0  2598k      0  0:00:02  0:00:01  0:00:01 2596k

100 6608k  100 6608k    0     0  3186k      0  0:00:02  0:00:02 --:--:-- 3186k

curl: (6) Could not resolve host: GSE74488_sc_expression.csv.gz

# Load the downloaded gzip-compressed CSV into a DataFrame (pandas infers the
# compression from the ".gz" suffix). Per the preview below, the table has a
# "Locus" column followed by one expression column per cell.
df = pd.read_csv("GSE74488_sc_expression.csv.gz")

# Preview the first rows.
df.head()

	Locus	wolsc_kb2_4_1	wolsc_kb2_4_10	wolsc_kb2_4_11	wolsc_kb2_4_13	wolsc_kb2_4_14	wolsc_kb2_4_15	wolsc_kb2_4_18	wolsc_kb2_4_19	wolsc_kb2_4_22	...	wolsc_kb3_2_1	sc_0113_pa_19	sc_0113_pa_3	sc_0113_pa_44	sc_0113_pa_52	sc_0113_pa_58	sc_0113_pa_59	sc_0113_pa_60	sc_0113_pa_68	sc_0113_pa_83
0	AT1G01010	0.000000	7.702431	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.000000	0.000000
1	AT1G01020	8.378906	0.000000	0.0	4.298833	0.0	0.0	0.0	0.0	0.0	...	0.0	2.275709	0.0	0.0	0.0	0.0	3.614329	0.0	4.642478	3.406784
2	AT1G01030	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.000000	0.000000
3	AT1G01040	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.000000	0.000000
4	AT1G01046	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.000000	0.000000

5 rows × 239 columns

URL = "https://www.cell.com/cms/10.1016/j.cell.2016.04.046/attachment/ccb8f6e8-4822-4e06-9400-2eccfd98dd56/mmc4.xlsx"
!curl {URL} -O mmc4.xlsx

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100 26662  100 26662    0     0  35313      0 --:--:-- --:--:-- --:--:-- 35267

curl: (6) Could not resolve host: mmc4.xlsx

# Read the annotation sheet (column names are on the second spreadsheet row)
# and keep only the three columns used downstream.
Metadata = pd.read_excel('mmc4.xlsx', header=1).loc[:, ["Cell", "Timepoint", "Identity"]]
# Number of cells recorded for each timepoint.
Metadata["Timepoint"].value_counts()

46hpc    86
3hpc     67
16hpc    55
Uncut    30
Name: Timepoint, dtype: int64

Uncut

# Rows of the annotation table whose Timepoint is "Uncut".
Metadata_Uncut = Metadata.loc[Metadata["Timepoint"] == "Uncut"]
print(Metadata_Uncut.shape)
# Map each cell ID to "<cell ID>_<Identity>"; used to relabel the expression
# columns when the WGCNA input table is written.
Dic_uncut = {
    cell: cell + "_" + identity
    for cell, identity in zip(Metadata_Uncut["Cell"], Metadata_Uncut["Identity"])
}
Metadata_Uncut.head()

(30, 3)

	Cell	Timepoint	Identity
0	wolsc_kb2_4_10	Uncut	Pericycle
1	wolsc_kb2_4_1	Uncut	Vasculature
2	wolsc_kb2_4_18	Uncut	Vasculature
3	wolsc_kb2_4_22	Uncut	Vasculature
4	wolsc_kb2_4_26	Uncut	Vasculature

# Expression table restricted to the "Uncut" cells, saved with its original
# column names.
df_Uncut = df[["Locus", *Metadata_Uncut["Cell"]]]
df_Uncut.to_csv(f"{Dir_Expression}Expr_Uncut.csv", index=False)

# WGCNA input layout: Gene_ID, Gene_name (a copy of Gene_ID), then the cells.
df_Uncut = (
    df_Uncut
    .rename(columns={"Locus": "Gene_ID"})
    .assign(Gene_name=lambda frame: frame["Gene_ID"])
)
# Move the two gene columns to the front and keep the cell order unchanged.
df_Uncut_cols = ["Gene_ID", "Gene_name"] + [
    col for col in df_Uncut.columns if col not in ("Gene_ID", "Gene_name")
]
# Relabel each cell column as "<cell ID>_<Identity>" before writing.
df_Uncut = df_Uncut[df_Uncut_cols].rename(columns=Dic_uncut)
df_Uncut.to_csv(f"{Dir_WGCNA}WGCNA_input_Uncut.csv", index=False)
df_Uncut.head()

	Gene_ID	Gene_name	wolsc_kb2_4_10_Pericycle	wolsc_kb2_4_1_Vasculature	wolsc_kb2_4_18_Vasculature	wolsc_kb2_4_22_Vasculature	wolsc_kb2_4_26_Vasculature	wolsc_kb2_4_27_Vasculature	wolsc_kb2_4_30_Vasculature	wolsc_kb2_4_41_Vasculature	...	wolsc_kb2_4_11_Unknown	wolsc_kb2_4_13_Unknown	wolsc_kb2_4_14_Unknown	wolsc_kb2_4_15_Unknown	wolsc_kb2_4_19_Unknown	wolsc_kb2_4_24_Unknown	wolsc_kb2_4_66_Unknown	wolsc_kb2_4_76_Unknown	wolsc_kb2_4_78_Unknown	wolsc_kb2_4_80_Unknown
0	AT1G01010	AT1G01010	7.702431	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
1	AT1G01020	AT1G01020	0.000000	8.378906	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	4.298833	0.0	0.0	0.0	0.0	0.0	0.0	11.596565	0.0
2	AT1G01030	AT1G01030	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
3	AT1G01040	AT1G01040	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
4	AT1G01046	AT1G01046	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0

5 rows × 32 columns

3hpc

# Rows of the annotation table whose Timepoint is "3hpc".
Metadata_3hpc = Metadata.loc[Metadata["Timepoint"] == "3hpc"]
print(Metadata_3hpc.shape)
# Map each cell ID to "<cell ID>_<Identity>"; used to relabel the expression
# columns when the WGCNA input table is written.
Dic_3hpc = {
    cell: cell + "_" + identity
    for cell, identity in zip(Metadata_3hpc["Cell"], Metadata_3hpc["Identity"])
}
Metadata_3hpc.head()

(67, 3)

	Cell	Timepoint	Identity
30	sc_1228_pa_30	3hpc	Epidermis\LRC
31	wolsc_kb2_3_13	3hpc	Mixed distal
32	wolsc_kb2_3_14	3hpc	Mixed distal
33	wolsc_kb2_3_2	3hpc	Mixed distal
34	wolsc_kb2_3_27	3hpc	Mixed distal

# Expression table restricted to the "3hpc" cells, saved with its original
# column names.
df_3hpc = df[["Locus", *Metadata_3hpc["Cell"]]]
df_3hpc.to_csv(f"{Dir_Expression}Expr_3hpc.csv", index=False)

# WGCNA input layout: Gene_ID, Gene_name (a copy of Gene_ID), then the cells.
df_3hpc = (
    df_3hpc
    .rename(columns={"Locus": "Gene_ID"})
    .assign(Gene_name=lambda frame: frame["Gene_ID"])
)
# Move the two gene columns to the front and keep the cell order unchanged.
df_3hpc_cols = ["Gene_ID", "Gene_name"] + [
    col for col in df_3hpc.columns if col not in ("Gene_ID", "Gene_name")
]
# Relabel each cell column as "<cell ID>_<Identity>" before writing.
df_3hpc = df_3hpc[df_3hpc_cols].rename(columns=Dic_3hpc)
df_3hpc.to_csv(f"{Dir_WGCNA}WGCNA_input_3hpc.csv", index=False)
df_3hpc.head()

	Gene_ID	Gene_name	sc_1228_pa_30_Epidermis\LRC	wolsc_kb2_3_13_Mixed distal	wolsc_kb2_3_14_Mixed distal	wolsc_kb2_3_2_Mixed distal	wolsc_kb2_3_27_Mixed distal	wolsc_kb2_3_51_Mixed distal	sc_1228_pa_14_Mixed distal	sc_1228_pa_86_Mixed distal	...	sc_1228_pb_5_Unknown	sc_1228_pb_70_Unknown	sc_1228_pb_78_Unknown	sc_1228_pb_86_Unknown	sc_1228_pb_93_Unknown	sc_1228_pa_36_Unknown	sc_1228_pa_57_Unknown	sc_1228_pa_77_Unknown	sc_1228_pa_78_Unknown	sc_1228_pa_85_Unknown
0	AT1G01010	AT1G01010	0.0	0.000000	0.000000	3.829904	0.0	0.000000	0.000000	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000
1	AT1G01020	AT1G01020	0.0	7.092747	5.949744	7.912041	0.0	6.881387	3.328156	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	2.212596
2	AT1G01030	AT1G01030	0.0	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000
3	AT1G01040	AT1G01040	0.0	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	3.963564	0.0	0.0	0.000000
4	AT1G01046	AT1G01046	0.0	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000

5 rows × 69 columns

END