Remove middle inconsistent characters from a column header column name with r

Question

` set.seed(500)

index <- sample(1:nrow(Bands_reflectance_2017),100, replace = FALSE )

Bands_reflectance_2017 <- dput(head(Bands_reflectance_2017[1:100]))

Bands_reflectance_2017 <- structure( list( t2017.01.05T08.25.12.000000000_blue = c(5064L, 5096L, 5072L, 5048L, 5048L, 5064L), t2017.01.15T08.26.22.000000000_blue = c(418L, 487L, 480L, 449L, 449L, 480L), t2017.01.25T08.21.38.000000000_blue = c(312L, 414L, 385L, 385L, 385L, 403L), t2017.02.04T08.27.09.000000000_blue = c(5156L, 5096L, 5204L, 5240L, 5240L, 5112L), t2017.02.14T08.27.29.000000000_blue = c(2554L, 2896L, 2842L, 2776L, 2776L, 2934L), t2017.02.24T08.23.38.000000000_blue = c(2662L, 2428L, 2630L, 2644L, 2644L, 2276L), t2017.03.06T08.24.47.000000000_blue = c(340L, 403L, 409L, 407L, 407L, 391L), t2017.03.16T08.16.07.000000000_blue = c(188L, 245L, 257L, 239L, 239L, 245L), t2017.03.26T08.22.43.000000000_blue = c(379L, 397L, 381L, 345L, 345L, 387L), t2017.04.05T08.23.06.000000000_blue = c(604L, 647L, 639L, 647L, 647L, 631L), t2017.04.15T08.23.45.000000000_blue = c(311L, 382L, 376L, 379L, 379L, 425L), t2017.04.25T08.23.17.000000000_blue = c(219L, 318L, 237L, 322L, 322L, 302L), t2017.05.05T08.23.45.000000000_blue = c(979L, 1030L, 1021L, 1030L, 1030L, 985L), t2017.05.15T08.28.11.000000000_blue = c(138L, 219L, 196L, 201L, 201L, 247L), t2017.05.25T08.23.46.000000000_blue = c(655L, 779L, 736L, 752L, 752L, 777L), t2017.06.04T08.25.50.000000000_blue = c(318L, 419L, 384L, 343L, 343L, 400L), t2017.06.14T08.28.06.000000000_blue = c(397L, 387L, 407L, 432L, 432L, 347L), t2017.06.24T08.26.00.000000000_blue = c(336L, 450L, 402L, 395L, 395L, 388L), t2017.07.04T08.23.42.000000000_blue = c(502L, 538L, 512L, 495L, 495L, 505L), t2017.07.09T08.23.09.000000000_blue = c(568L, 597L, 639L, 611L, 611L, 577L), t2017.07.19T08.23.43.000000000_blue = c(479L, 517L, 536L, 529L, 529L, 528L), t2017.07.24T08.23.44.000000000_blue = c(409L, 499L, 499L, 473L, 473L, 482L), t2017.07.29T08.26.12.000000000_blue = c(781L, 801L, 810L, 823L, 823L, 735L), t2017.08.03T08.26.43.000000000_blue = c(517L, 579L, 560L, 583L, 583L, 564L), t2017.08.08T08.23.41.000000000_blue = c(575L, 654L, 650L, 650L, 650L, 602L), t2017.08.13T08.23.44.000000000_blue = c(623L, 679L, 708L, 698L, 698L, 677L), t2017.08.18T08.25.16.000000000_blue = c(614L, 651L, 648L, 597L, 597L, 651L), t2017.08.23T08.22.22.000000000_blue = c(554L, 613L, 559L, 524L, 524L, 596L), t2017.08.28T08.28.01.000000000_blue = c(769L, 814L, 772L, 744L, 744L, 828L), t2017.09.02T08.23.42.000000000_blue = c(756L, 761L, 763L, 783L, 783L, 742L), t2017.09.07T08.23.30.000000000_blue = c(807L, 865L, 826L, 838L, 838L, 837L), t2017.09.12T08.23.35.000000000_blue = c(861L, 869L, 876L, 904L, 904L, 869L), t2017.09.22T08.23.38.000000000_blue = c(4640L, 3780L, 4340L, 4728L, 4728L, 3060L), t2017.09.27T08.16.41.000000000_blue = c(778L, 777L, 811L, 839L, 839L, 752L), t2017.10.02T08.17.41.000000000_blue = c(766L, 868L, 851L, 857L, 857L, 799L), t2017.10.07T08.24.51.000000000_blue = c(767L, 816L, 839L, 830L, 830L, 753L), t2017.10.12T08.24.39.000000000_blue = c(678L, 688L, 706L, 750L, 750L, 627L), t2017.10.17T08.15.32.000000000_blue = c(678L, 769L, 804L, 797L, 797L, 711L), t2017.10.22T08.21.34.000000000_blue = c(3146L, 3134L, 3128L, 3160L, 3160L, 3118L), t2017.10.27T08.23.27.000000000_blue = c(612L, 697L, 721L, 697L, 697L, 708L), t2017.11.01T08.24.41.000000000_blue = c(941L, 982L, 1001L, 1010L, 1010L, 999L), t2017.11.06T08.20.50.000000000_blue = c(670L, 824L, 836L, 824L, 824L, 785L), t2017.11.11T08.27.40.000000000_blue = c(720L, 817L, 839L, 807L, 807L, 801L), t2017.11.16T08.16.16.000000000_blue = c(9824L, 9744L, 9792L, 9744L, 9744L, 9536L), t2017.11.21T08.17.00.000000000_blue = c(749L, 841L, 838L, 738L, 738L, 830L), t2017.11.26T08.25.13.000000000_blue = c(735L, 863L, 832L, 713L, 713L, 899L), t2017.12.01T08.20.22.000000000_blue = c(674L, 836L, 816L, 800L, 800L, 771L), t2017.12.06T08.19.42.000000000_blue = c(2742L, 2770L, 2742L, 2762L, 2762L, 2798L), t2017.12.11T08.19.00.000000000_blue = c(582L, 745L, 734L, 654L, 654L, 743L), t2017.12.16T08.23.19.000000000_blue = c(926L, 1054L, 1001L, 946L, 946L, 1054L), t2017.12.21T08.20.53.000000000_blue = c(7432L, 7484L, 7456L, 7404L, 7404L, 7484L), t2017.12.26T08.20.39.000000000_blue = c(629L, 724L, 762L, 738L, 738L, 731L), t2017.12.31T08.20.04.000000000_blue = c(667L, 765L, 762L, 718L, 718L, 765L), t2017.01.05T08.25.12.000000000_green = c(5224L, 5196L, 5208L, 5152L, 5152L, 5172L), t2017.01.15T08.26.22.000000000_green = c(837L, 938L, 907L, 858L, 858L, 927L), t2017.01.25T08.21.38.000000000_green = c(735L, 808L, 770L, 770L, 770L, 836L), t2017.02.04T08.27.09.000000000_green = c(5424L, 5492L, 5488L, 5536L, 5536L, 5832L), t2017.02.14T08.27.29.000000000_green = c(3050L, 3094L, 3108L, 3228L, 3228L, 2900L), t2017.02.24T08.23.38.000000000_green = c(2664L, 2450L, 2598L, 2646L, 2646L, 2340L), t2017.03.06T08.24.47.000000000_green = c(702L, 735L, 749L, 727L, 727L, 729L), t2017.03.16T08.16.07.000000000_green = c(632L, 685L, 708L, 685L, 685L, 703L), t2017.03.26T08.22.43.000000000_green = c(744L, 841L, 806L, 809L, 809L, 818L), t2017.04.05T08.23.06.000000000_green = c(1030L, 1036L, 1044L, 1050L, 1050L, 1040L), t2017.04.15T08.23.45.000000000_green = c(634L, 720L, 708L, 699L, 699L, 751L), t2017.04.25T08.23.17.000000000_green = c(619L, 698L, 716L, 723L, 723L, 687L), t2017.05.05T08.23.45.000000000_green = c(1340L, 1368L, 1374L, 1404L, 1404L, 1354L), t2017.05.15T08.28.11.000000000_green = c(525L, 633L, 619L, 612L, 612L, 626L), t2017.05.25T08.23.46.000000000_green = c(1042L, 1118L, 1078L, 1028L, 1028L, 1148L), t2017.06.04T08.25.50.000000000_green = c(655L, 778L, 783L, 769L, 769L, 813L), t2017.06.14T08.28.06.000000000_green = c(772L, 829L, 838L, 810L, 810L, 822L), t2017.06.24T08.26.00.000000000_green = c(741L, 888L, 848L, 798L, 798L, 865L), t2017.07.04T08.23.42.000000000_green = c(867L, 918L, 912L, 846L, 846L, 946L), t2017.07.09T08.23.09.000000000_green = c(936L, 1001L, 1012L, 972L, 972L, 985L), t2017.07.19T08.23.43.000000000_green = c(848L, 911L, 925L, 915L, 915L, 903L), t2017.07.24T08.23.44.000000000_green = c(855L, 907L, 947L, 913L, 913L, 937L), t2017.07.29T08.26.12.000000000_green = c(1096L, 1106L, 1134L, 1150L, 1150L, 1116L), t2017.08.03T08.26.43.000000000_green = c(987L, 1072L, 1040L, 1030L, 1030L, 1021L), t2017.08.08T08.23.41.000000000_green = c(996L, 1011L, 1001L, 1011L, 1011L, 1032L), t2017.08.13T08.23.44.000000000_green = c(1006L, 1100L, 1082L, 1078L, 1078L, 1092L), t2017.08.18T08.25.16.000000000_green = c(977L, 1034L, 1032L, 976L, 976L, 1020L), t2017.08.23T08.22.22.000000000_green = c(976L, 1054L, 1044L, 985L, 985L, 1072L), t2017.08.28T08.28.01.000000000_green = c(1162L, 1176L, 1188L, 1150L, 1150L, 1200L), t2017.09.02T08.23.42.000000000_green = c(1136L, 1152L, 1158L, 1176L, 1176L, 1130L), t2017.09.07T08.23.30.000000000_green = c(1122L, 1166L, 1174L, 1194L, 1194L, 1162L), t2017.09.12T08.23.35.000000000_green = c(1158L, 1170L, 1168L, 1180L, 1180L, 1146L), t2017.09.22T08.23.38.000000000_green = c(3304L, 3218L, 3072L, 3580L, 3580L, 4148L), t2017.09.27T08.16.41.000000000_green = c(1172L, 1228L, 1242L, 1224L, 1224L, 1172L), t2017.10.02T08.17.41.000000000_green = c(1148L, 1224L, 1220L, 1200L, 1200L, 1164L), t2017.10.07T08.24.51.000000000_green = c(1120L, 1164L, 1160L, 1148L, 1148L, 1114L), t2017.10.12T08.24.39.000000000_green = c(1124L, 1158L, 1166L, 1144L, 1144L, 1090L), t2017.10.17T08.15.32.000000000_green = c(1092L, 1190L, 1180L, 1154L, 1154L, 1146L), t2017.10.22T08.21.34.000000000_green = c(3140L, 3124L, 3142L, 3134L, 3134L, 3096L), t2017.10.27T08.23.27.000000000_green = c(1064L, 1104L, 1116L, 1078L, 1078L, 1098L), t2017.11.01T08.24.41.000000000_green = c(1298L, 1310L, 1344L, 1344L, 1344L, 1318L), t2017.11.06T08.20.50.000000000_green = c(1114L, 1240L, 1220L, 1164L, 1164L, 1212L), t2017.11.11T08.27.40.000000000_green = c(1182L,1278L, 1278L, 1192L, 1192L, 1284L), t2017.11.16T08.16.16.000000000_green = c(8872L, 8728L, 8816L, 8904L, 8904L, 8600L), t2017.11.21T08.17.00.000000000_green = c(1166L, 1268L, 1250L, 1158L, 1158L, 1260L), t2017.11.26T08.25.13.000000000_green = c(1138L, 1272L, 1288L, 1240L, 1240L, 1278L)), row.names = c(NA, 6L), class = "data.frame") `

I have a dataframe of dates for per specific bands with 534 column headers as follow:

"t2017-12-31T08:20:04.000000000_red_edge_3"
"t2017-02-04T08:27:09.000000000_nir_1"    
"t2017-12-31T08:20:04.000000000_swir_2"

Now, I want to remove everything and only remain with the date and the band name e.g in column header one and two, I want to only remain with

"2017-12-31_red_edge_3"
"2017-02-04_nir_1"

I have about 534 columns and most characters are not consistent because each date time is different and more band examples not similar to what is shown here for all the 534 records, so I was only able to remove repetitive characters such as "T08", ":","t" and "000000000" which are available in all the columns. How do I remove the values between the date and the band characters when they vary per each column and so I cannot use :

for ( col in 1:ncol(Bands_reflectance_2017[5:534])){
  colnames(Bands_reflectance_2017)[5:534] <-  sub(".000000000", "", colnames(Bands_reflectance_2017)[5:534]) #Remove .000000000
  
}

etc

Also at the end of the day, I want to replace each bandname with a band coding system such as assign "nir-1" as "B8" and "12" as the month of "December" so that for example my first and second column header reads:

B7_December31	B8_February02
Cell 1	Cell 2
Cell 3	Cell 4

"B7_December31", "B8_February02" which are better naming to run in a random forest. Because I am running into problems of

Error in eval(predvars, data, env) : object '"t2017-12-31T08:20:04.000000000_red_edge_3"' not found

if I keep the naming convention in the example

I have the following column header names in my dataframe (Bands_reflectance_2017) of 534 columns :

"t2017-01-25T08:21:38.000000000_blue"	"t2017-08-23T08:22:22.000000000_green"
Cell 1	Cell 2
Cell 3	Cell 4

I want to remove everything except the date and band name e.g "2017_01_25_blue"

I tried:

for ( col in 1:ncol(Bands_reflectance_2017[5:534])){
  colnames(Bands_reflectance_2017)[5:534] <-  sub("T08", "", colnames(Bands_reflectance_2017)[5:534]) #Remove T08

But as some of the characters I want to remove are unique per each 534 columns, I am not sure how to remove them

I expect this at the end of the day:

2017_01_25_blue	2017_08_23_green
Cell 1	Cell 2
Cell 3	Cell 4

The later

"B2_December31", B3_August23
Cell 1
Cell 3

I also tried this :

substr(colnames(Bands_Reflectance_2017[2:335]),2,11)

What is the best way to do it? I am fairly new to programming and to r.

What language is this? – Andrej Kesely Nov 17 '22 at 22:53 — Andrej Kesely, Nov 17 '22 at 22:53

John Polo · Accepted Answer · 2022-11-18T18:35:13.783

0

Thanks for sharing your code and data. Most people won't download random files. In the future you can share data with dput(data) or a smaller version with dput(head(data)).

library(stringr)
library(lubridate)

# Using the data frame that you provided with dput, which I call "df1" here
# You'll probably have to adjust the numbers between the [] because your
# data frame is vastly different from what I have and I'm not sure I have 
# the write number, but since you said 534 columns, I'm using that. 
df1 <- names(df1)[1:534]

band_names <- rep(NA, length(df1))

# This is messy. I'm sure someone who knows stringr or 
# regex better has a neater way to do this.
# str_locate will find positions in a string and return the numeric value of the position
# str_sub uses positions to pull substrings
# gsub replaces patterns
# What this does is find the positions of the dates or labels, 
# pulls out the substring, replaces things not needed 
# (like "-" I used to mark positions), changed the number for date 
# to something numeric so that month() can be switched from number to text.
for(i in 1:length(df1)) { 
  band_names[i] <- paste0(as.character(month(as.numeric(gsub("\\.","",
                          str_sub(df1[i],str_locate(df1[i],"\\.[0-9]{2}")))),
                          label=T, abbr = F)),gsub("T","",str_sub(df1[i],str_locate(df1[i],
                          "\\.[0-9]{2}T"))),"_",
                          str_sub(df1[i],str_locate(df1[i],"[a-z]{3,}.+")))}


# You can look at the results
band_names
[1] "Dec-12_red_edge_3" "Feb-02_nir_1"      "Dec-12_swir_2"   

# Split up band_names to replace the band label with number
band_out <- str_sub(band_names, 7)
band_stay <- str_sub(band_names, 1, 6)

# Made data frame up for the few example lines. I'm not downloading the CSV and I'm not going to find out the actual band names, labels, and numbers. 
fake_bands <- data.frame(label = c("red_edge_3", "nir_1", "swir_2"), number = c("b1","b3","b2"))

# Change out labels for the numbers
band_replace <- fake_bands[match(band_out, fake_bands$label), "number"]

new_names <- paste0(band_stay, band_replace)
new_name
[1] "Dec-12_b1" "Feb-02_b3" "Dec-12_b2"

# Again, you might have to adjust the numbers in []
names(df1)[1:534] <- new_names

You're going to have to expand/replace the fake_bands data frame I made here with a data frame that has two columns. One column should have the labels, like "red_edge_3", and the other should have the appropriate band number.

edited Nov 18 '22 at 18:35

answered Nov 18 '22 at 02:51

John Polo

547
1
8
25

I can't tell if you meant to type more or not? Can't tell what the error is. Can you add `dput(head(yourdf))` to the question? – John Polo Nov 18 '22 at 16:02
Thank you very much, however I get this error when I put my actual df of 534 columns: "Error in 1:(Bands_reflectance_2017) : NA/NaN argument In addition: Warning message: In 1:(Bands_reflectance_2017) : numerical expression has 534 elements: only the first used" – Nangula Nov 18 '22 at 16:07
I could do that but it seems I maxed out my edits due to many pending edits. – Nangula Nov 18 '22 at 16:08
Maxed out your comments? Or edits? – John Polo Nov 18 '22 at 16:09
Sorry I mean edits – Nangula Nov 18 '22 at 16:11
Are you using `read.csv()` to read in your data? – John Polo Nov 18 '22 at 16:18
@ John Polo Yes – Nangula Nov 18 '22 at 16:23
Updated the code. Next time, please include a `dput` of your data. The way you showed the data doesn't actually match what was in the file. – John Polo Nov 18 '22 at 16:33
@ John Polo, I added the dput on the data. I am not so sure I got it right, and if you are able to reproduce the data on your side or I messed up the question more – Nangula Nov 18 '22 at 17:22
@ John It works, thank very much, except for one thing, I need the full date to stay, which means we must keep the two numbers before T08. The code gives me "Dec.12_swir_2", whilst I need December as a month and the specific day of the month plus the bandname. Eg. for: "t2017.01.05T08.25.12.000000000_blue" , "t2017.01.15T08.26.22.000000000_blue" , "t2017.01.25T08.21.38.000000000_blue", I need them with the day of the month information like = "Jan.05_blue", "January.15_blue", "January.25_blue". Otherwise the dates will not be unique for the Random Forest analysis I want to do. – Nangula Nov 18 '22 at 18:08
I changed the loop so that the names are full names for the months and not abbreviations and the day of the month should also be preserved. I also changed the first line that creates the `df1` to try to match what you provided in the `dput`, which was vastly different from what I had when I read in the file. – John Polo Nov 18 '22 at 18:36
If this works, please use the check mark for the accepted answer. Thanks. – John Polo Nov 20 '22 at 15:02

Remove middle inconsistent characters from a column header column name with r

1 Answers1