R의 대체 문자를 기반으로 문자열 분할

Program Tip

R의 대체 문자를 기반으로 문자열 분할

programtip 2020. 10. 20. 08:05

R에서 번갈아 나타나는 문자를 기준으로 문자열 분할

문자열을 다음과 같이 분할하는 효율적인 방법을 찾으려고합니다.

"111110000011110000111000"

벡터로

[1] "11111" "00000" "1111" "0000" "111" "000"

여기서 "0"과 "1"은 번갈아 나타나는 임의의 문자일 수 있습니다.

다음을 시도해 보세요

# Split at every zero-width position where a "1" is directly followed by a "0"
# or a "0" by a "1". Lookbehind/lookahead consume no characters; they need
# perl = TRUE.
strsplit(str1, '(?<=1)(?=0)|(?<=0)(?=1)', perl=TRUE)[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

벤치 마크

library(stringi) 
set.seed(24)
x3 <- stri_rand_strings(1, 1e4)

# akrun: extract every maximal run of one repeated word character from the
# global string `x3` with stringi's regex extractor.
akrun <- function() {
  run_matches <- stri_extract_all_regex(x3, '(?:(\\w))\\1*')
  run_matches[[1L]]
}
# thelate: base-R version of the run-matching regex (generalised from
# @thelatemail's answer). Reads the global string `x3`.
thelate <- function() {
  run_positions <- gregexpr("(?:(\\w))\\1*", x3, perl = TRUE)
  regmatches(x3, run_positions)[[1L]]
}
# rawr: split the global string `x3` at each position that follows a word
# character and is not followed by that same character.
rawr <- function() {
  pieces <- strsplit(x3, '(?<=(\\w))(?!\\1)', perl = TRUE)
  pieces[[1L]]
}
# ananda: split the global string `x3` into runs of identical characters by
# reading it as one fixed-width record whose field widths are the run lengths.
# Returns a character vector named V1, V2, ... with one element per run.
ananda <- function() {
  # textConnection() hands back an already-open connection, which read.fwf()
  # does not close; close it on exit so no connection is leaked.
  con <- textConnection(x3)
  on.exit(close(con), add = TRUE)
  unlist(read.fwf(con,
                  rle(strsplit(x3, "")[[1]])$lengths,
                  colClasses = "character"))
}
# Colonel: run-length encode the characters of the global string `x3`, then
# rebuild each run by repeating its character and pasting the copies together.
Colonel <- function() {
  runs <- rle(strsplit(x3, '')[[1]])
  mapply(
    function(n, ch) paste0(rep(ch, n), collapse = ''),
    runs$lengths,
    runs$values
  )
}

# Cryo: split the global string `x3` into runs of identical characters with an
# explicit loop over a preallocated result vector.
# Returns a character vector with one element per run (character(0) for "").
Cryo <- function() {
  n_chars <- nchar(x3)
  if (n_chars == 0L) {
    return(character(0))
  }

  res_vector <- rep(NA_character_, n_chars)
  res_vector[1] <- substr(x3, 1, 1)
  counter <- 1
  # Seed with the first character. Starting from '' made the second character
  # always open a new run, so "aaa" came out as "a", "aa".
  old_tmp <- res_vector[1]

  # seq_len(n)[-1] is empty for a one-character string; 2:n would count down.
  for (i in seq_len(n_chars)[-1]) {
    tmp <- substr(x3, i, i)
    if (tmp == old_tmp) {
      res_vector[counter] <- paste0(res_vector[counter], tmp)
    } else {
      counter <- counter + 1
      res_vector[counter] <- tmp
    }
    old_tmp <- tmp
  }

  # Only the first `counter` slots were filled.
  res_vector[seq_len(counter)]
}


 # richard: break the global string `x3` into characters with stringi,
 # run-length encode them, and cut `x3` at the cumulative run ends.
 richard <- function() {
   chars <- stri_split_boundaries(x3, type = "character")[[1L]]
   run_ends <- cumsum(rle(chars)$lengths)
   run_starts <- c(1, head(run_ends + 1, -1))
   stri_sub(x3, run_starts, run_ends)
 }

 # nicola: split the single string `x` into runs of identical characters.
 # Run boundaries are found where consecutive code points differ, then the
 # pieces are cut out with substring().
 # Returns a character vector of runs ("" for the empty string).
 nicola <- function(x) {
   # Compare code points, not raw bytes: charToRaw() yields byte offsets, which
   # disagree with the character offsets used by nchar()/substring() as soon
   # as `x` contains a multibyte character.
   code_points <- utf8ToInt(enc2utf8(x))
   indices <- c(0, which(diff(code_points) != 0), nchar(x))
   substring(x, indices[-length(indices)] + 1, indices[-1])
 }

 # richard2: like richard(), but the characters of the global string `x3`
 # come from base strsplit(); the first rle() component holds the run lengths.
 richard2 <- function() {
   run_lengths <- rle(strsplit(x3, NULL)[[1L]])[[1L]]
   run_ends <- cumsum(run_lengths)
   stri_sub(x3, c(1, head(run_ends + 1, -1)), run_ends)
 }

# One-off timings of each candidate on the current x3 (generated above with
# stri_rand_strings(1, 1e4)). The commented lines are the recorded
# system.time() output.
system.time(akrun())
# user  system elapsed 
# 0.003   0.000   0.003 

system.time(thelate())
#   user  system elapsed 
#  0.272   0.001   0.274 

system.time(rawr())
# user  system elapsed 
#  0.397   0.001   0.398 

system.time(ananda())
#  user  system elapsed 
# 3.744   0.204   3.949 

system.time(Colonel())
#   user  system elapsed 
#  0.154   0.001   0.154 

system.time(Cryo())
#  user  system elapsed 
# 0.220   0.005   0.226 

system.time(richard())
#  user  system elapsed 
# 0.007   0.000   0.006 

system.time(nicola(x3))
# user  system elapsed 
# 0.190   0.001   0.191

조금 더 큰 문자열에 대해

# Regenerate x3 as a 1e6-character random string (same seed) and time a
# subset of the candidates; the commented lines are the recorded output.
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

system.time(akrun())
#user  system elapsed 
#0.166   0.000   0.155 
system.time(richard())
#  user  system elapsed 
# 0.606   0.000   0.569 
system.time(richard2())
#  user  system elapsed 
# 0.518   0.000   0.487 

system.time(Colonel())
#  user  system elapsed 
# 9.631   0.000   9.358 


# Relative comparison over 20 repetitions (unit = 'relative' scales to the fastest).
library(microbenchmark)
 microbenchmark(richard(), richard2(), akrun(), times=20L, unit='relative')
 #Unit: relative
 #     expr      min       lq     mean   median       uq      max neval cld
 # richard() 2.438570 2.633896 2.365686 2.315503 2.368917 2.124581    20   b
 #richard2() 2.389131 2.533301 2.223521 2.143112 2.153633 2.157861    20   b
 # akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20  a

참고 : 다른 방법을 실행하려고했지만 시간이 오래 걸립니다.

데이터

# Sample inputs: a 0/1 string, a string with several different digits, and a
# string mixing lower-case letters, upper-case letters and digits.
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

테마의 변형 :

x <- "111110000011110000111000"
# gregexpr() locates every maximal run of 1s or of 0s; regmatches() extracts them.
regmatches(x,gregexpr("1+|0+",x))[[1]]
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

rle과 함께 substr 또는 read.fwf를 사용할 수도 있습니다 (정규식 기반 솔루션만큼 효율적이지 않을 수 있음).

x <- "111110000011110000111000"
# Read x as one fixed-width record whose field widths are its run lengths.
# textConnection() returns an already-open connection that read.fwf() does
# not close, so keep a handle and close it once the record has been read.
con <- textConnection(x)
pieces <- unlist(read.fwf(con,
                          rle(strsplit(x, "")[[1]])$lengths,
                          colClasses = "character"))
close(con)
pieces
#      V1      V2      V3      V4      V5      V6 
# "11111" "00000"  "1111"  "0000"   "111"   "000"

이 접근 방식의 한 가지 장점은 다음과 같은 경우에도 작동한다는 것입니다.

x <- paste(c(rep("a", 5), rep("b", 2), rep("c", 7),
             rep("b", 3), rep("a", 1), rep("d", 1)), collapse = "")
x
# [1] "aaaaabbcccccccbbbad"

# Same fixed-width trick on a string with more than two distinct characters.
# The text connection is opened by textConnection() and is not closed by
# read.fwf(), so close it explicitly after reading.
con <- textConnection(x)
pieces <- unlist(read.fwf(con,
                          rle(strsplit(x, "")[[1]])$lengths,
                          colClasses = "character"))
close(con)
pieces
#        V1        V2        V3        V4        V5        V6 
#   "aaaaa"      "bb" "ccccccc"     "bbb"       "a"       "d"

또 다른 방법은 번갈아 나타나는 숫자 사이에 공백을 추가하는 것입니다. 이것은 1과 0뿐 아니라 임의의 두 숫자에 대해서도 작동합니다. 그런 다음 공백을 기준으로 strsplit을 사용하십시오.

x <- "111110000011110000111000"

# Append a space to every digit that is not followed by the same digit.
# The pattern has a single capture group, so the replacement uses \\1 only
# (the former \\2 referred to a group that does not exist).
(y <- gsub('(\\d)(?!\\1)', '\\1 ', x, perl = TRUE))
# [1] "11111 00000 1111 0000 111 000 "


# Split on the inserted spaces; strsplit() drops the trailing empty piece.
strsplit(y, ' ')[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

또는 @akrun이 지적한 것처럼 더 간결하게 :

# Split at each zero-width position that follows a digit (captured as group 1)
# and is not followed by that same digit.
strsplit(x, '(?<=(\\d))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

또한 \\d를 \\w로 바꿔도 작동합니다.

# Same split with \\w in place of \\d, so letters are handled as well as digits.
x  <- "aaaaabbcccccccbbbad"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"      

# The \\w version still gives the expected result on the 0/1 string.
x <- "111110000011110000111000"
strsplit(x, '(?<=(\\w))(?!\\1)', perl=TRUE)[[1]]
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

캡처 그룹(\\1 및 \\2)을 명시적으로 사용하는 대신 \\K를 사용할 수도 있습니다. 다만 자주 쓰이는 것을 보지 못했고, 저도 어떻게 설명해야 할지는 잘 모르겠습니다.

AFAIK \\K는보고 된 일치의 시작 지점을 재설정하고 이전에 사용 된 문자는 더 이상 포함되지 않으며 기본적으로 해당 지점까지 일치하는 모든 항목을 버립니다.

x <- "1111100000222000333300011110000111000"
# \\K discards the digit matched so far from the reported match, so only the
# zero-width position after it is replaced by a space (note the trailing space).
(z <- gsub('(\\d)\\K(?!\\1)', ' ', x, perl = TRUE))
# [1] "11111 00000 222 000 3333 000 1111 0000 111 000 "

원래 접근 방식 : 다음은 rle()을 활용한 stringi 접근 방식입니다.

x <- "111110000011110000111000"
library(stringi)

# Cumulative run lengths give the end position of every run.
cs <- cumsum(
    rle(stri_split_boundaries(x, type = "character")[[1L]])$lengths
)
# Each run starts one past the previous run's end; the first starts at 1.
stri_sub(x, c(1L, head(cs + 1L, -1L)), cs)
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

또는 stri_sub()에서 length 인수를 사용할 수 있습니다.

# Same start positions as before, but each piece is given by its run length
# (stri_sub's `length` argument) instead of its end position.
rl <- rle(stri_split_boundaries(x, type = "character")[[1L]])
with(rl, {
    stri_sub(x, c(1L, head(cumsum(lengths) + 1L, -1L)), length = lengths)
})
# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

효율성을 위한 업데이트 : base::strsplit()이 stringi::stri_split_boundaries()보다 빠르다는 것을 깨달은 후, 기본 함수만 사용하여 이전 답변을 더 효율적으로 만든 버전입니다.

# Time the base-only variant on a 1e6-character random string.
set.seed(24)
x3 <- stri_rand_strings(1L, 1e6L)

system.time({
    # The first rle() component holds the run lengths; cumsum gives run ends.
    cs <- cumsum(rle(strsplit(x3, NULL)[[1L]])[[1L]])
    substring(x3, c(1L, head(cs + 1L, -1L)), cs)
})
#   user  system elapsed 
#  0.686   0.012   0.697

mapply를 사용한 또 다른 접근 방식 :

x="111110000011110000111000"

# Run-length encode the characters, then rebuild each run by repeating its
# value `u` times and pasting the copies together.
with(rle(strsplit(x,'')[[1]]), 
     mapply(function(u,v) paste0(rep(v,u), collapse=''), lengths, values))
#[1] "11111" "00000" "1111"  "0000"  "111"   "000"

OP가 찾던 것(간결한 R 코드)은 아니지만 Rcpp로 시도해 보았습니다. 비교적 간단하면서도 가장 빠른 R 기반 답변보다 약 5배 빠릅니다.

library(Rcpp)

# split_str_cpp(x): compiled helper that splits the string x into maximal runs
# of identical consecutive characters and returns them as a character vector
# (empty vector for an empty string).
# The loop stops at the last valid index and the final run is flushed
# explicitly. The previous version iterated up to i == x.length() and used
# x[x.length()] as a sentinel, which is only well-defined for a non-const
# std::string from C++11 onward, and it compared a signed int with size_t.
cppFunction(
  'std::vector<std::string> split_str_cpp(std::string x) {

  std::vector<std::string> parts;

  // Nothing to split; also keeps the final push_back from adding "".
  if (x.empty()) return parts;

  std::size_t start = 0;

  for (std::size_t i = 1; i < x.length(); i++) {
      if (x[i] != x[i-1]) {
        parts.push_back(x.substr(start, i - start));
        start = i;
      }
  }

  // Flush the last run, which no character change terminates.
  parts.push_back(x.substr(start));

  return parts;

  }')

그리고 이것들에 대한 테스트

# Test inputs for split_str_cpp(): a 0/1 string, a multi-digit string, and a
# string mixing letters of both cases with digits.
str1 <- "111110000011110000111000"
x1 <- "1111100000222000333300011110000111000"
x2 <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"

다음 출력을 제공합니다.

> split_str_cpp(str1)
[1] "11111" "00000" "1111"  "0000"  "111"   "000"  
> split_str_cpp(x1)
 [1] "11111" "00000" "222"   "000"   "3333"  "000"   "1111"  "0000"  "111"   "000"  
> split_str_cpp(x2)
 [1] "aaaaa"   "bb"      "ccccccc" "bbb"     "a"       "d"       "11111"   "00000"   "222"     "aaa"     "bb"      "cc"      "d"       "11"     
[15] "D"       "aa"      "BB"

벤치 마크에 따르면 R 솔루션보다 약 5 ~ 10 배 빠릅니다.

# akrun: split `str1` at every zero-width boundary between a "1" and a "0"
# (either order). Only 0/1 boundaries are recognised by this pattern.
akrun <- function(str1) {
  boundary <- '(?<=1)(?=0)|(?<=0)(?=1)'
  strsplit(str1, boundary, perl = TRUE)[[1L]]
}

# richard1: split `x3` into runs of identical characters. stringi breaks the
# string into characters, rle() measures the runs, and stri_sub() cuts `x3`
# at the cumulative run ends.
richard1 <- function(x3) {
  chars <- stri_split_boundaries(x3, type = "character")[[1L]]
  run_ends <- cumsum(rle(chars)$lengths)
  run_starts <- c(1, head(run_ends + 1, -1))
  stri_sub(x3, run_starts, run_ends)
}

# richard2: like richard1(), but the characters come from base strsplit();
# the first component of the rle() result holds the run lengths.
richard2 <- function(x3) {
  run_lengths <- rle(strsplit(x3, NULL)[[1L]])[[1L]]
  run_ends <- cumsum(run_lengths)
  stri_sub(x3, c(1, head(run_ends + 1, -1)), run_ends)
}

library(microbenchmark)
library(stringi)

# Benchmark input: one random 1e6-character string with a fixed seed.
set.seed(24)
x3 <- stri_rand_strings(1, 1e6)

# 20 repetitions each; unit = 'relative' reports times relative to the fastest.
microbenchmark(split_str_cpp(x3), akrun(x3), richard1(x3), richard2(x3), unit = 'relative', times=20L)

비교:

Unit: relative
              expr      min       lq     mean   median       uq      max neval
 split_str_cpp(x3) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    20
         akrun(x3) 9.675613 8.952997 8.241750 8.689001 8.403634 4.423134    20
      richard1(x3) 5.355620 5.226103 5.483171 5.947053 5.982943 3.379446    20
      richard2(x3) 4.842398 4.756086 5.046077 5.389570 5.389193 3.669680    20

간단한 for루프 솔루션

x <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
# Start the first run with the first character; later runs are appended.
res_vector <- substr(x, 1, 1)

# seq_len(nchar(x))[-1] is empty for a one-character string, whereas
# 2:nchar(x) would count down (2, 1) and append bogus elements.
for (i in seq_len(nchar(x))[-1]) {
  tmp <- substr(x, i, i)
  last <- length(res_vector)
  if (tmp == substr(x, i - 1, i - 1)) {
    # Same character as the previous position: extend the current run.
    res_vector[last] <- paste0(res_vector[last], tmp)
  } else {
    # Character changed: open a new run.
    res_vector[last + 1] <- tmp
  }
}

res_vector

#[1] "aaaaa"  "bb"  "ccccccc"  "bbb"  "a"  "d"  "11111"  "00000"  "222"  "aaa"  "bb"  "cc"  "d"  "11"  "D"  "aa"  "BB"

또는 미리 할당 된 결과 벡터로 조금 더 빠를 수도 있습니다.

x <- "aaaaabbcccccccbbbad1111100000222aaabbccd11DaaBB"
# Preallocate one slot per character (the worst case); unused slots stay NA.
res_vector <- rep(NA_character_, nchar(x))
res_vector[1] <- substr(x, 1, 1)
counter <- 1
# Seed with the first character. Starting from '' made the second character
# always open a new run, so the leading "aaaaa" came out as "a", "aaaa".
old_tmp <- res_vector[1]

# seq_len(nchar(x))[-1] is empty for a one-character string; 2:nchar(x)
# would count down instead.
for (i in seq_len(nchar(x))[-1]) {
  tmp <- substr(x, i, i)
  if (tmp == old_tmp) {
    res_vector[counter] <- paste0(res_vector[counter], tmp)
  } else {
    res_vector[counter + 1] <- tmp
    counter <- counter + 1
  }
  old_tmp <- tmp
}

# Drop the slots that were never filled.
res_vector[!is.na(res_vector)]

이것은 어떤가요:

s <- "111110000011110000111000"

# Splitting on "10" or "01" swallows one character on each side of every
# boundary, so each piece is missing its edge characters.
spl <- strsplit(s, "10|01")[[1]]
l <- length(spl)
# Restore the swallowed characters: odd-numbered pieces get "1", even-numbered
# pieces get "0" (i %% 2). Inner pieces lost a character on both sides, the
# first and last piece only on one side.
# NOTE(review): this reconstruction presumes s starts with "1"; confirm before
# reusing it on other inputs.
runs <- vapply(seq_len(l), function(i) {
  bit <- i %% 2
  second <- if (i == 1 || i == l) "" else bit
  paste0(spl[i], bit, second)
}, character(1))
runs

# [1] "11111" "00000" "1111"  "0000"  "111"   "000"

참고 URL : https://stackoverflow.com/questions/29383296/split-string-based-on-alternating-character-in-r

'Program Tip' 카테고리의 다른 글

JSON을 CSV 형식으로 변환하고 변수에 저장하는 방법 (0)	2020.10.20
범위 입력 슬라이더를 세로로 표시하는 방법 (0)	2020.10.20
Python에서 * .wav 파일 읽기 (0)	2020.10.20
Capybara에서 "_blank"대상이있는 링크의 새 창으로 어떻게 전환합니까? (0)	2020.10.20
Visual Studio : 솔루션 탐색기를 현재 파일로 스크롤하는 바로 가기 (0)	2020.10.20

현재글R의 대체 문자를 기반으로 문자열 분할

programtip 프로그램 Tip을 알려드립니다.

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

programtip

R의 대체 문자를 기반으로 문자열 분할

R의 대체 문자를 기반으로 문자열 분할

최신 정보

벤치 마크

데이터

'Program Tip' 카테고리의 다른 글

'Program Tip'의 다른글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

R의 대체 문자를 기반으로 문자열 분할

R의 대체 문자를 기반으로 문자열 분할

최신 정보

벤치 마크

데이터

'Program Tip' 카테고리의 다른 글

'Program Tip'의 다른글

관련글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역