Update readme

807bf3f0 · Swastik Mishra · b030c600 · 807bf3f0 · 807bf3f0
Commit 807bf3f0 authored 6 months ago by Swastik Mishra
--- a/README.md
+++ b/README.md
@@ -20,3 +20,5 @@ The only other file you need to install is `xlsx2csv` using `pip`:
 ```
 pip install xlsx2csv
 ```
+
+Additionally, I used `ripgrep` instead of `grep` for faster extraction of subset data from EggNOG. If you rely on `grep` instead, edit the notebooks accordingly (replace the `rg` commands).
--- a/notebooks/01-01_download_data.ipynb
+++ b/notebooks/01-01_download_data.ipynb
@@ -9,8 +9,10 @@
   ]
  },
  {
-   "cell_type": "raw",
+   "cell_type": "code",
+   "execution_count": null,
   "metadata": {},
+   "outputs": [],
   "source": [
    "e6_url = \"http://eggnog6.embl.de/download/eggnog_6.0/\"\n",
    "all_trees_path = e6_url + \"e6.all_raw_trees_and_algs.tsv\"\n",
@@ -38,12 +40,10 @@
   ]
  },
  {
-   "cell_type": "raw",
-   "metadata": {
-    "vscode": {
-     "languageId": "raw"
-    }
-   },
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
   "source": [
    "%%bash -s \"$data_dir\" \"$all_trees_path\" \"$og2seqs_species_path\" \"$seq2ogs_path\" \"$gold_url\"\n",
    "pip install xlsx2csv\n",

 %% Cell type:markdown id: tags:

 ## Step 1: Downloading all data
 Downloading from eggNOG and GOLD should take around 30 min to 1 h given a fast internet connection; in total there are about 150 GB of data.

-%% Cell type:raw id: tags:
+%% Cell type:code id: tags:

+``` python
 e6_url = "http://eggnog6.embl.de/download/eggnog_6.0/"
 all_trees_path = e6_url + "e6.all_raw_trees_and_algs.tsv"
 og2seqs_species_path = e6_url + "e6.og2seqs_and_species.tsv"
 seq2ogs_path = e6_url + "e6.seq2ogs.tsv"
 gold_url = "https://gold.jgi.doe.gov/download?mode=site_excel"
+```

 %% Cell type:code id: tags:

 ``` python
 import os
 parent_dir = os.path.dirname(os.getcwd())
 data_dir = os.path.join(parent_dir, 'data')
 ```

 %% Cell type:markdown id: tags:

 Use the bash command `wget` to download all files. For GOLD we need to extract a single sheet from the file; for that, `xlsx2csv` needs to be installed via pip.

-%% Cell type:raw id: tags:
+%% Cell type:code id: tags:

+``` python
 %%bash -s "$data_dir" "$all_trees_path" "$og2seqs_species_path" "$seq2ogs_path" "$gold_url"
 pip install xlsx2csv
 mkdir -p $1

 mkdir -p $1/eggnog6
 wget $2 -P $1/eggnog6
 wget $3 -P $1/eggnog6
 wget $4 -P $1/eggnog6

 mkdir -p $1/gold
 wget $5 -O $1/gold/goldData.xlsx
+```

 %% Cell type:code id: tags:

 ``` python
 %%bash -s "$data_dir"
 xlsx2csv $1/gold/goldData.xlsx -s 4 > $1/gold/goldData_organisms.csv
 # rm $1/goldData.xlsx
 ```

 %% Cell type:markdown id: tags:

 All e6 files are quite large and take a long time to work with, so the first step is to extract a subset containing only Bacteria data:

 %% Cell type:code id: tags:

 ``` python
 %%bash -s "$data_dir"
 # Select bacterial OGs, and bacterial trees.
 # The second column in the trees file should be "2". There are no other single-character columns,
 # so the first and third columns can then be used to prepare a subset COG->tree mapping file.
 rg "\t2\t" $1//eggnog6/e6.all_raw_trees_and_algs.tsv > $1//eggnog6/e6.all_raw_trees_and_algs_bacteria.tsv
 # similarly here the first column is the taxonomic grouping
 rg -P '^2\t' $1//eggnog6/e6.og2seqs_and_species.tsv > $1//eggnog6/e6.og2seqs_and_species_bacteria.tsv
 ```