0

new dataset

"1" "A.Kejriwal Sena"
"2" "Bhanwarlal Sharma"
"3" "Millennium Post"
"4" ""
"5" "Mushkil hai Zindagi"
"6" ""
"7" "niraj"
"8" ""
"9" "Dharmender Malik"
"10"    "S. M. Malik"
"11"    "Hocalwire"
"12"    "mansoor"
"13"    "PRRRK"
"14"    "Narendra Solanki"
"15"    "Dsekid"
"16"    "Rutvik Subhedar"
"17"    "Liberator Furiosa"
"18"    "The Anarchy Man  "
"19"    "Swamidutta"
"20"    "Phaneendra"
"21"    "Rutvik Subhedar"
"22"    "Rutvik Subhedar"
"23"    "S.Mehrotra"
"24"    "Mrigen Sharma"
"25"    "Arvind Kejriwal"
"26"    "Bitter Pills"
"27"    "Tarush Bhalla"
"28"    "Americai Narayanan"
"29"    "Rupsa Sata Durga"
"30"    "Dr Sudhakar Phulekar"
"31"    "!"
"32"    "Lala Rahul"
"33"    "Rakesh panda"
"34"    "Jayesh Mehta"
"35"    "Kuldeep Bhan"
"36"    "K K Raman"
"37"    "Kaliya"
"38"    ""
"39"    "Rana Dev Rajbanshi"
"40"    "Rahul Nirmal"
"41"    "Satya Prakash Tyagi"
"42"    "Ashutosh Singh"
"43"    "JMS:"
"44"    ""
"45"    "akif bhati"
"46"    "Arun Joseph"
"47"    "#IamAAP"
"48"    "suryanraju18 jaihind"
"49"    "PANKAJ YADAV"
"50"    "Satya Prakash Tyagi"
"51"    "Rohit TK"
"52"    "Adarsh Choudhary"
"53"    "Xtreme Nationalist"
"54"    "KCAggarwal"
"55"    "NANDKISHORE SHARMA"
"56"    "Uttam Dutta"
"57"    "P K Agarwal"
"58"    "Uttam Dutta"
"59"    "Deol"
"60"    "Rkmishra"
"61"    "Suneel Kumar"
"62"    "Rkmishra"
"63"    "Mohsin"
"64"    "Ranjeet Krishna"
"65"    "Jaynul Haq Choudhury"
"66"    "Vitthal Mundra"
"67"    "Nagesh H"
"68"    "Suneel Kumar"
"69"    "AHMAD KAMAL"
"70"    "sanjeev"
"71"    "Kaliya"
"72"    "Dinesh"
"73"    "Zoher Malkapurwala"
"74"    "suryanraju18 jaihind"
"75"    "Anand"
"76"    "Xtreme Nationalist"
"77"    "Aalamjeet Rangi"
"78"    ""
"79"    "Vimlendra Vimal"
"80"    "Rana. R.S"
"81"    "RaviVisvesvaraPrasad"
"82"    "Virupaksha hs"
"83"    "Siddharth"
"84"    "Millennium Post"
"85"    "Kishanpal"
"86"    "Santhosh Kolkunda"
"87"    "Surabhi Agarwal"
"88"    "Hocalwire"
"89"    "Rutvik Subhedar"
"90"    "Informed Indians "
"91"    "P.r Meghwanshi"
"92"    "Rajat "
"93"    "Zooni khan"
"94"    "real indian"
"95"    "Rahul Nirmal"
"96"    "P.r Meghwanshi"
"97"    "Bibhav"
"98"    ""
"99"    ""
"100"   "K Padma Rani"
"101"   "Ganesha"
"102"   "Xtreme Nationalist"
"103"   "love humanity"
"104"   "JeSuis Rohith Vemula"
"105"   "samira"
"106"   "Munendr Sharma"
"107"   "Rawat Singh Tomar"
"108"   "Raspal kaur"
"109"   "Vivek Gupta"
"110"   "Syed Zia"
"111"   "suryanraju18 jaihind"
"112"   "{AAP "
"113"   "Shravan Mansanpally"
"114"   "sghosh"
"115"   "Lakshmi Srikanth"
"116"   "Sanjaybjp"
"117"   "Razzak Ali Khan"
"118"   "Deepanita Mazumder"
"119"   "belvin vaz"
"120"   "Amit Kumar"
"121"   "#SherDilKejriwal"
"122"   "anuj"
"123"   "Sambi Reddy"
"124"   "Ranjan Kumar Jha"
"125"   "Mohsin"
"126"   "JeSuis Rohith Vemula"
"127"   "Vivek"
"128"   "Dolli"
"129"   "Bharat_Mata_Ki_Jay"
"130"   "Anantkumar"
"131"   "Flower"
"132"   "ARCHANA SINGH"
"133"   "avinash kumar"
"134"   ""
"135"   "Rajesh Mittal"
"136"   "Samik Banerjee"
"137"   "ASHWANI KUMAR GOYAL"
"138"   "Suneel Kumar"
"139"   "Shravan Mansanpally"
"140"   "rajA"
"141"   "Shravan Mansanpally"
"142"   "Mamta Yadav"
"143"   "Dr.Chintan Raval"
"144"   "suryanraju18 jaihind"
"145"   "Dr Sudhakar Phulekar"
"146"   "bilal motorwala"
"147"   "arif007"
"148"   "Dr Sudhakar Phulekar"
"149"   "Rakesh Jaiswal"
"150"   "Dr Sudhakar Phulekar"
"151"   "Prof. Satish Pandey"
"152"   "Mohammad Armanullah"
"153"   "KCAggarwal"
"154"   "Astha Mittal"
"155"   "Rajesh Sharma "
"156"   "Aditya"
"157"   "Rajesh Mittal"
"158"   "Anil Kumar"
"159"   "Niyati"
"160"   "Phronesis Partners"
"161"   "Anand Bhatt"
"162"   "CSS by Design"
"163"   "Naresh Rajput"
"164"   "Engineer Sid"
"165"   "Flower"
"166"   "Rebellion"
"167"   "Mebin"
"168"   "v.asish kumar"
"169"   "Tum se na ho payega!"
"170"   "Ranjan Singh"
"171"   "mohan munya rathod"
"172"   "DINDIGUL CA STUDENTS"
"173"   "Vibha Sachdeva"
"174"   "GT #MRX"
"175"   "Mitesh"
"176"   "Hobbes3103"
"177"   "Azad Swaraj1"
"178"   "NewsBoss.in"
"179"   "INDER MORWAL"
"180"   "kasani sukhadev"
"181"   "Mayur Panghaal"
"182"   "Chin_Chan"
"183"   "Amit Shukla"
"184"   "Mayur Panghaal"
"185"   "INDER MORWAL"
"186"   "mAt global"
"187"   "shamshad shaique"
"188"   "Niraj Bhatia "
"189"   "Aarti"
"190"   "Sudhir Bhardwaj "
"191"   "Abhishek Vishnoi"
"192"   "AAP Delhi Official"
"193"   "WeLove VidyutJammwal"
"194"   "Nagesh H"
"195"   "Vicky Singh Rajput"
"196"   "Lalit Kalra "
"197"   ""
"198"   "raju"
"199"   "knowAguy"
"200"   "Judie Custer"
"201"   "Gibreel Farishta"
"202"   "Onkar Pandey"
"203"   "Sampath Simon"
"204"   "Thammegowda M D"
"205"   "Sickular indian"
"206"   "Truthful"
"207"   "ajay Kumar nirala"
"208"   "ajay Kumar nirala"
"209"   "Farhan"
"210"   "AAPSuratVarachha"
"211"   "siva kumar jagirapu"
"212"   "uniindianews"
"213"   "Rajendra Pande"
"214"   "Kirti Bhushan"
"215"   "Sabrina MzTrueHEART"
"216"   "krishna ts"
(and so on, up to 3683 rows)

I applied the name2sex function to this data set and got the following error:

Error in `$<-.data.frame`(`*tmp*`, "gender", value = c(NA, NA, NA, NA,  : 
   replacement has 3961 rows, data has 3683

I used:

# Reproduction of the length mismatch: name2sex() is applied to the name
# column of the data frame `new`, and the length of the result is inspected.
library(qdap)
# Extract the name column as a plain character vector.
names <- as.character(new$name)
# Run name2sex() on the full vector of names.
gender <- name2sex(names)
# Check how many values came back; `new` has 3683 rows, so 3683 was expected.
length(gender)
#[1]3961

But my new dataset has 3683 rows, not 3961. It gives output for description a and e, because they consist of a single word only, but not for the whole dataset. I tried another way, i.e.,

# Second attempt: assign the name2sex() result directly as a new column.
library(qdap)
# Extract the name column as a plain character vector.
names <- as.character(new$name)
# This assignment fails: the replacement has 3961 values, while the data
# frame `new` has 3683 rows (see the error message below).
new$gender <- name2sex(names)
#Error in `$<-.data.frame`(`*tmp*`, "gender", value = c(NA, NA, NA, NA,  : 
# replacement has 3961 rows, data has 3683
IRTFM
  • 258,963
  • 21
  • 364
  • 487
kriti
  • 1
  • 2
  • 3
    It may be possible that some words got split up. The example you gave works fine (although there are 3 NAs in the output). So, it is better you provide an example that gives the error. – akrun Mar 11 '17 at 06:50
  • @akrun attached a sample of original dataset that gives the error. – kriti Mar 11 '17 at 09:32

1 Answer

3

You could do

# Derive a gender for every entry of `names` from the name counts in
# genderdata::ssa_national and store the result as a new column of `new`.
#
# The result is built by left-joining the lookup table onto the original
# vector of names, so it always contains exactly one value per entry of
# `names`, in the original order. Names without a match in the table get NA.
new$gender <- genderdata::ssa_national %>%
  # The lookup is done on lower-cased names, so matching ignores case.
  filter(name %in% tolower(names)) %>%
  group_by(name) %>%
  # Collapse the table to one row per name with total female/male counts.
  dplyr::summarise(
    female = sum(female),
    male = sum(male)
  ) %>%
  mutate(
    proportion_female = round(female / (male + female), digits = 4)
  ) %>%
  # Exactly balanced names are labelled "either"; otherwise the majority wins.
  mutate(
    gender = ifelse(
      proportion_female == 0.5,
      "either",
      ifelse(proportion_female > 0.5, "female", "male")
    )
  ) %>%
  rename(join_name = name) %>%
  {
    # left_join() (rather than full_join()) keeps every row of the names
    # table and can never append rows that exist only in the lookup table.
    left_join(
      tibble(name = names, join_name = tolower(names)),
      .,
      by = "join_name"
    )
  } %>%
  pull(gender)

That's basically what's going on under the hood - use debug(name2sex) to inspect that for yourself.

Also note that you are supposed to feed the function with first names and not with arbitrary names.

lukeA
  • 53,097
  • 5
  • 97
  • 100
  • Thanks, it's working, but most of the names output NA even when feeding only first names. Is there any solution to this problem? – kriti Mar 11 '17 at 15:29
  • The solution would be a.) to clean your list ("Bitter Pills", "Hocalwire", ...), b.) to use a database that is appropriate for your locale ("Rakesh" is probably not that popular among Americans), and c.) to use custom rules ("The Anarchy Man" is probably male, but that cannot be derived from a first name <-> gender database). All in all, your problem is probably beyond the scope of Stack Overflow. – lukeA Mar 11 '17 at 16:16
  • Ok. These are Indian names. name2sex is not appropriate for Indian names. – kriti Mar 11 '17 at 17:27
  • Using name2sex, we can compute the output for rakesh. name <- c("rakesh") name2sex(name) [1] M – kriti Mar 11 '17 at 17:59
  • Using name2sex, we can't compute the output for tarush. name <- c("tarush") name2sex(name) [1] <NA>. Here's the data: https://www.ssa.gov/oact/babynames/limits.html (I think this is it). – lukeA Mar 11 '17 at 19:39
  • Few names output M/F using name2sex function, but using above code it outputs NA. Any database or resources you would suggest to be used for Indian names??? – kriti Mar 12 '17 at 04:22
  • Which names? No suggestion, sorry. I'm sure you'll find some if you search. – lukeA Mar 12 '17 at 13:08
  • Amit, Rajesh, lakshmi outputs M, M, F using name2sex function. – kriti Mar 12 '17 at 18:28