-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmissingORduplicate.py
33 lines (28 loc) · 1.19 KB
/
missingORduplicate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
import pandas as pd
from dir_mana import dir_mana
from lister import Lister
# Project helper setup
# :
# dir_mana is the directory-management helper: it is created from the current
# working directory and the project name so the project's folders can be
# reached on demand.
home = os.getcwd()
project = "Orthologs-Project"
user = "rgilmore"
where = dir_mana(home, project)

# Lister is pointed at the Master RNA Accession File.
what = Lister('MAFV3.1.csv')  # Keep this file name in sync with the real file

# Load the main table; every column is read as a string.
maf = pd.read_csv(
    'data/processed/karg-maf.csv',
    index_col=False,
    dtype=str,
)

# Load the organisms file (it has no header row); its first column supplies
# the list of organism names.
orgs = pd.read_csv(
    'data/interim/Organisms.csv',
    index_col=False,
    dtype=str,
    header=None,
)
orglist = orgs[0].tolist()
# Create dictionaries for duplicate values and blank cells/values.
# Each maps an organism (column name) to a boolean Series aligned with the
# rows of the main file.
dupdict = {}  # True where the organism's value occurs more than once
nadict = {}  # True where the organism's cell is blank or n/a

# Build both masks for every organism in orglist
for org in orglist:
    column = maf[org]
    nas = column.isnull()  # Get any empty spaces in a column.
    # pandas treats missing values as equal to one another, so a plain
    # duplicated() call would report every blank cell as a duplicate as soon
    # as a column holds two or more of them. Blanks are already recorded in
    # nadict, so they are masked out of the duplicate mask here.
    dups = column.duplicated(keep=False) & ~nas  # Get real duplicates only.
    dupdict[org] = dups
    nadict[org] = nas