溫馨提示×

您好,登錄后才能下訂單哦!

密碼登錄×
登錄注冊(cè)×
其他方式登錄
點(diǎn)擊 登錄注冊(cè) 即表示同意《億速云用戶服務(wù)條款》

proc sql語句在數(shù)據(jù)清洗中的運(yùn)用

發(fā)布時(shí)間:2021-08-30 15:19:47 來源:億速云 閱讀:139 作者:chen 欄目:數(shù)據(jù)庫

本篇內(nèi)容介紹了“proc sql語句在數(shù)據(jù)清洗中的運(yùn)用”的有關(guān)知識(shí),在實(shí)際案例的操作過程中,不少人都會(huì)遇到這樣的困境,接下來就讓小編帶領(lǐng)大家學(xué)習(xí)一下如何處理這些情況吧!希望大家仔細(xì)閱讀,能夠?qū)W有所成!

libname clean "c:/books/clean";                                           /*定義永久性數(shù)據(jù)庫*/

*新建一個(gè)樣本數(shù)據(jù)one;
data one;
   input X Y Z;
datalines;
1 2 3
101 202 303
44 55 66
444 555 666
;
title "Values of X from data set ONE where X is greater than 100";
/*有條件的選擇觀測(cè)值*/

proc sql;
   select X
   from one
   where X gt 100;               
quit;

*Program 8-2;
***檢查無效的字符型的數(shù)值;
title "Checking for Invalid Character Data";
proc sql;
   select Patno,
          Gender,
          DX,
          AE
   from clean.patients
   where Gender not in ('M','F',' ')           or
         notdigit(trim(DX))and not missing(DX) or
         AE not in ('0','1',' ');
quit;

*檢查無效的數(shù)值型的數(shù)值;
title "Checking for out-of-range numeric values";
proc sql;
   select Patno,
          HR,
          SBP,
          DBP
   from clean.patients
   where HR  not between 40 and 100 and not missing(HR)    or
         SBP not between 80 and 200 and not missing(SBP)   or
         DBP not between 60 and 120 and not missing(DBP);
quit;

*基于標(biāo)準(zhǔn)差利用簡(jiǎn)單的算法來檢查數(shù)值;
title "Data values beyond two standard deviations";
proc sql;
   select Patno,
          SBP
   from clean.patients
   having SBP not between mean(SBP) - 2 * std(SBP) and
      mean(SBP) + 2 * std(SBP)                     and
      SBP is not missing;
quit;

*檢查缺失值;
options linesize=84;
title "Observations with missing values";
proc sql;
   select *
   from clean.patients
   where Patno   is missing or
         Gender  is missing or
         Visit   is missing or
         HR      is missing or
         SBP     is missing or
         DBP     is missing or
         DX      is missing or
         AE      is missing;
quit;

*檢查日期;
title "Dates before June 1, 1998 or after October 15, 1999";
proc sql;
   select Patno,
          Visit
   from clean.patients
   where Visit not between '01jun1998'd and '15oct1999'd and
         Visit is not missing;
quit;

*檢查重復(fù)值;
title "Duplicate Patient Numbers";
proc sql;
   select Patno,
          Visit
      from clean.patients
      group by Patno
      having count(Patno) gt 1;
quit;

*識(shí)別對(duì)應(yīng)多個(gè)觀察值的變量;
title "Listing of patients who do not have two visits";
proc sql;
   select Patno,
          Visit
      from clean.patients2
      group by Patno
      having count(Patno) ne 2;
quit;

*檢查兩個(gè)文件中對(duì)應(yīng)要求的序列號(hào)ID;
data one;
   input Patno X Y;
datalines;
1 69 79
2 56 .
3 66 99
5 98 87
12 13 14
;
data two;
   input Patno Z;
datalines;
1 56
3 67
4 88
5 98
13 99
;

*兩個(gè)文件都不含的ID;
title "Patient numbers not in both files";
proc sql;
   select One.patno as ID_one,
          Two.patno as ID_two
   from one full join two
   on One.patno eq Two.patno
   where One.patno is missing or Two.patno is missing;
quit;

“proc sql語句在數(shù)據(jù)清洗中的運(yùn)用”的內(nèi)容就介紹到這里了,感謝大家的閱讀。如果想了解更多行業(yè)相關(guān)的知識(shí)可以關(guān)注億速云網(wǎng)站,小編將為大家輸出更多高質(zhì)量的實(shí)用文章!

向AI問一下細(xì)節(jié)

免責(zé)聲明:本站發(fā)布的內(nèi)容(圖片、視頻和文字)以原創(chuàng)、轉(zhuǎn)載和分享為主,文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng),如果涉及侵權(quán)請(qǐng)聯(lián)系站長(zhǎng)郵箱:is@yisu.com進(jìn)行舉報(bào),并提供相關(guān)證據(jù),一經(jīng)查實(shí),將立刻刪除涉嫌侵權(quán)內(nèi)容。

sql
AI